From 5242de617fa27a700fac970bd74be4c54826644b Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Tue, 17 Dec 2024 14:53:19 +0000 Subject: [PATCH 01/13] Add corpora key to the /config response json --- app/models/metadata.py | 15 ++++++ app/repository/corpus.py | 47 +++++++++++++++++ app/repository/lookups.py | 2 + .../non_search/routers/lookups/test_config.py | 52 +++++++++++++++---- 4 files changed, 106 insertions(+), 10 deletions(-) create mode 100644 app/repository/corpus.py diff --git a/app/models/metadata.py b/app/models/metadata.py index ab278085..ea4db29d 100644 --- a/app/models/metadata.py +++ b/app/models/metadata.py @@ -25,6 +25,20 @@ class OrganisationConfig(BaseModel): count_by_category: Mapping[str, int] +class CorpusConfig(BaseModel): + """Contains the Corpus, Organisation and CorpusType info + as well as stats used on homepage""" + + corpus_import_id: str + title: str + description: str + corpus_type: str + corpus_type_description: str + taxonomy: TaxonomyData + text: str + image_url: str + + class ApplicationConfig(BaseModel): """Definition of the new Config which just includes taxonomy.""" @@ -32,3 +46,4 @@ class ApplicationConfig(BaseModel): organisations: Mapping[str, OrganisationConfig] languages: Mapping[str, str] document_variants: Sequence[str] + corpora: Sequence[CorpusConfig] diff --git a/app/repository/corpus.py b/app/repository/corpus.py new file mode 100644 index 00000000..8b0d5a36 --- /dev/null +++ b/app/repository/corpus.py @@ -0,0 +1,47 @@ +from db_client.models.dfce.family import Corpus +from db_client.models.organisation import CorpusType +from sqlalchemy.orm import Session + +from app import config +from app.models.metadata import CorpusConfig + + +def _to_corpus_data(row) -> CorpusConfig: + image_url = ( + f"https://{config.CDN_DOMAIN}/{row.image_url}" + if row.image_url is not None and len(row.image_url) > 0 + else "" + ) + corpus_text = row.text if row.text is not None else "" + return CorpusConfig( + corpus_import_id=row.corpus_import_id, + title=row.title, + description=row.description, + corpus_type=row.corpus_type, + corpus_type_description=row.corpus_type_description, + taxonomy={**row.taxonomy}, + text=corpus_text, + image_url=image_url, + ) + + +def get_allowed_corpora(db: Session, allowed_corpora: list[str]) -> list[CorpusConfig]: + query = db.query( + Corpus.import_id.label("corpus_import_id"), + Corpus.title.label("title"), + Corpus.description.label("description"), + Corpus.corpus_image_url.label("image_url"), + Corpus.corpus_text.label("text"), + Corpus.corpus_type_name.label("corpus_type"), + CorpusType.description.label("corpus_type_description"), + CorpusType.valid_metadata.label("taxonomy"), + ).join( + Corpus, + Corpus.corpus_type_name == CorpusType.name, + ) + if allowed_corpora != []: + query = query.filter(Corpus.import_id.in_(allowed_corpora)) + + corpora = query.all() + + return [_to_corpus_data(row) for row in corpora] diff --git a/app/repository/lookups.py b/app/repository/lookups.py index 38d0809f..936ed034 100644 --- a/app/repository/lookups.py +++ b/app/repository/lookups.py @@ -8,6 +8,7 @@ from sqlalchemy.orm import Session from app.models.metadata import ApplicationConfig +from app.repository.corpus import get_allowed_corpora from app.repository.organisation import get_organisation_config, get_organisations from app.service.pipeline import IMPORT_ID_MATCHER from app.service.util import tree_table_to_json @@ -28,6 +29,7 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: variant.variant_name for variant in db.query(Variant).order_by(Variant.variant_name).all() ], + corpora=get_allowed_corpora(db, allowed_corpora), ) diff --git a/tests/non_search/routers/lookups/test_config.py b/tests/non_search/routers/lookups/test_config.py index 65fc56ce..1a576750 100644 --- a/tests/non_search/routers/lookups/test_config.py +++ b/tests/non_search/routers/lookups/test_config.py @@ -76,25 +76,23 @@ def _add_family(test_db, import_id: str, cat: FamilyCategory, corpus_import_id): ) -def test_config_endpoint_content(data_client, data_db, valid_token): +def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_token): """Tests whether we get the expected content when the /config endpoint is called.""" # TODO: this test is fragile, we should look into validation according to the # supporting data, rather than counts & fixed lists url_under_test = "/api/v1/config" + app_token = app_token_factory( + "CCLW.corpus.i00000001.n0000,UNFCCC.corpus.i00000001.n0000" + ) - response = data_client.get(url_under_test, headers={"app-token": valid_token}) + response = data_client.get(url_under_test, headers={"app-token": app_token}) response_json = response.json() assert response.status_code == OK assert ( set(response_json.keys()) - ^ { - "geographies", - "organisations", - "document_variants", - "languages", - } + ^ {"geographies", "organisations", "document_variants", "languages", "corpora"} == set() ) @@ -111,9 +109,43 @@ def test_config_endpoint_content(data_client, data_db, valid_token): assert len(response_json["document_variants"]) == 2 assert "Original Language" in response_json["document_variants"] - # Now test organisations - assert "organisations" in response_json + corpora = response_json["corpora"] + assert len(corpora) == 2 + + assert corpora[0]["corpus_import_id"] == "CCLW.corpus.i00000001.n0000" + assert corpora[0]["corpus_type"] == "Laws and Policies" + assert ( + corpora[0]["image_url"] + == "https://cdn.climatepolicyradar.org/corpora/CCLW.corpus.i00000001.n0000/logo.png" + ) + assert "Grantham Research Institute" in corpora[0]["text"] + assert corpora[0]["corpus_type_description"] == "Laws and policies" + assert corpora[0]["description"] == "CCLW national policies" + assert corpora[0]["title"] == "CCLW national policies" + assert set(corpora[0]["taxonomy"]) ^ EXPECTED_CCLW_TAXONOMY == set() + + # Check document roles. + assert "role" in corpora[0]["taxonomy"]["_document"].keys() + assert len(corpora[0]["taxonomy"]["_document"]["role"]["allowed_values"]) == 10 + assert "MAIN" in corpora[0]["taxonomy"]["_document"]["role"]["allowed_values"] + # Check document roles. + assert "type" in corpora[0]["taxonomy"]["_document"].keys() + assert len(corpora[0]["taxonomy"]["_document"]["type"]["allowed_values"]) == 76 + assert ( + "Adaptation Communication" + in corpora[0]["taxonomy"]["_document"]["type"]["allowed_values"] + ) + + # Check event types. + assert len(corpora[0]["taxonomy"]["_event"]["event_type"]["allowed_values"]) == 17 + assert ( + "Passed/Approved" + in corpora[0]["taxonomy"]["_event"]["event_type"]["allowed_values"] + ) + + # Below to be removed as part of PDCT-1759 + # Now test organisations assert "CCLW" in response_json["organisations"] cclw_org = response_json["organisations"]["CCLW"] assert len(cclw_org) == LEN_ORG_CONFIG From 205d62f6fa13785410c7d7e5481827e3dc0c9bfb Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Tue, 17 Dec 2024 15:11:49 +0000 Subject: [PATCH 02/13] Add org info to the new corpora key in /config response --- app/models/metadata.py | 2 ++ app/repository/corpus.py | 34 ++++++++++++------- .../non_search/routers/lookups/test_config.py | 2 ++ 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/app/models/metadata.py b/app/models/metadata.py index ea4db29d..b36fa5a2 100644 --- a/app/models/metadata.py +++ b/app/models/metadata.py @@ -37,6 +37,8 @@ class CorpusConfig(BaseModel): taxonomy: TaxonomyData text: str image_url: str + organisation_name: str + organisation_id: int class ApplicationConfig(BaseModel): diff --git a/app/repository/corpus.py b/app/repository/corpus.py index 8b0d5a36..667e91ff 100644 --- a/app/repository/corpus.py +++ b/app/repository/corpus.py @@ -1,5 +1,5 @@ from db_client.models.dfce.family import Corpus -from db_client.models.organisation import CorpusType +from db_client.models.organisation import CorpusType, Organisation from sqlalchemy.orm import Session from app import config @@ -22,22 +22,30 @@ def _to_corpus_data(row) -> CorpusConfig: taxonomy={**row.taxonomy}, text=corpus_text, image_url=image_url, + organisation_id=row.organisation_id, + organisation_name=row.organisation_name, ) def get_allowed_corpora(db: Session, allowed_corpora: list[str]) -> list[CorpusConfig]: - query = db.query( - Corpus.import_id.label("corpus_import_id"), - Corpus.title.label("title"), - Corpus.description.label("description"), - Corpus.corpus_image_url.label("image_url"), - Corpus.corpus_text.label("text"), - Corpus.corpus_type_name.label("corpus_type"), - CorpusType.description.label("corpus_type_description"), - CorpusType.valid_metadata.label("taxonomy"), - ).join( - Corpus, - Corpus.corpus_type_name == CorpusType.name, + query = ( + db.query( + Corpus.import_id.label("corpus_import_id"), + Corpus.title.label("title"), + Corpus.description.label("description"), + Corpus.corpus_image_url.label("image_url"), + Corpus.corpus_text.label("text"), + Corpus.corpus_type_name.label("corpus_type"), + CorpusType.description.label("corpus_type_description"), + CorpusType.valid_metadata.label("taxonomy"), + Organisation.id.label("organisation_id"), + Organisation.name.label("organisation_name"), + ) + .join( + CorpusType, + Corpus.corpus_type_name == CorpusType.name, + ) + .join(Organisation, Corpus.organisation_id == Organisation.id) ) if allowed_corpora != []: query = query.filter(Corpus.import_id.in_(allowed_corpora)) diff --git a/tests/non_search/routers/lookups/test_config.py b/tests/non_search/routers/lookups/test_config.py index 1a576750..8c112b84 100644 --- a/tests/non_search/routers/lookups/test_config.py +++ b/tests/non_search/routers/lookups/test_config.py @@ -114,6 +114,8 @@ def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_ assert corpora[0]["corpus_import_id"] == "CCLW.corpus.i00000001.n0000" assert corpora[0]["corpus_type"] == "Laws and Policies" + assert corpora[0]["organisation_name"] == "CCLW" + assert corpora[0]["organisation_id"] == 1 assert ( corpora[0]["image_url"] == "https://cdn.climatepolicyradar.org/corpora/CCLW.corpus.i00000001.n0000/logo.png" From 0357c63829a4e066e603633a5c933b2f078bd13f Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Tue, 17 Dec 2024 17:37:46 +0000 Subject: [PATCH 03/13] Add family stats to new corpora key in /config response --- app/models/metadata.py | 10 +++- app/repository/corpus.py | 48 +++++++++++++++-- app/repository/lookups.py | 4 +- .../non_search/routers/lookups/test_config.py | 54 +++++++++++++++++-- 4 files changed, 104 insertions(+), 12 deletions(-) diff --git a/app/models/metadata.py b/app/models/metadata.py index b36fa5a2..520d422d 100644 --- a/app/models/metadata.py +++ b/app/models/metadata.py @@ -29,16 +29,22 @@ class CorpusConfig(BaseModel): """Contains the Corpus, Organisation and CorpusType info as well as stats used on homepage""" + # From corpus corpus_import_id: str title: str description: str + image_url: str + text: str + # From corpus_type corpus_type: str corpus_type_description: str taxonomy: TaxonomyData - text: str - image_url: str + # From organisation organisation_name: str organisation_id: int + # No of families in corpus + total: int + count_by_category: Mapping[str, int] class ApplicationConfig(BaseModel): diff --git a/app/repository/corpus.py b/app/repository/corpus.py index 667e91ff..af3714c4 100644 --- a/app/repository/corpus.py +++ b/app/repository/corpus.py @@ -1,12 +1,43 @@ -from db_client.models.dfce.family import Corpus +from typing import Any + +from db_client.models.dfce.family import Corpus, Family, FamilyCategory, FamilyCorpus from db_client.models.organisation import CorpusType, Organisation +from sqlalchemy import func from sqlalchemy.orm import Session from app import config from app.models.metadata import CorpusConfig -def _to_corpus_data(row) -> CorpusConfig: +def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: + total = ( + db.query(Family) + .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) + .filter(FamilyCorpus.corpus_import_id == corpus_import_id) + .count() + ) + + counts = ( + db.query(Family.family_category, func.count()) + .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) + .filter(FamilyCorpus.corpus_import_id == corpus_import_id) + .group_by(Family.family_category) + .all() + ) + found_categories = {c[0].value: c[1] for c in counts} + count_by_category = {} + + # Supply zeros when there aren't any + for category in [e.value for e in FamilyCategory]: + if category in found_categories.keys(): + count_by_category[category] = found_categories[category] + else: + count_by_category[category] = 0 + + return {"total": total, "count_by_category": count_by_category} + + +def _to_corpus_config(row, stats: dict[str, Any]) -> CorpusConfig: image_url = ( f"https://{config.CDN_DOMAIN}/{row.image_url}" if row.image_url is not None and len(row.image_url) > 0 @@ -24,10 +55,14 @@ def _to_corpus_data(row) -> CorpusConfig: image_url=image_url, organisation_id=row.organisation_id, organisation_name=row.organisation_name, + total=stats["total"], + count_by_category=stats["count_by_category"], ) -def get_allowed_corpora(db: Session, allowed_corpora: list[str]) -> list[CorpusConfig]: +def get_config_for_allowed_corpora( + db: Session, allowed_corpora: list[str] +) -> list[CorpusConfig]: query = ( db.query( Corpus.import_id.label("corpus_import_id"), @@ -52,4 +87,9 @@ def get_allowed_corpora(db: Session, allowed_corpora: list[str]) -> list[CorpusC corpora = query.all() - return [_to_corpus_data(row) for row in corpora] + return [ + _to_corpus_config( + row, _get_family_stats_per_corpus(db=db, corpus_import_id=row[0]) + ) + for row in corpora + ] diff --git a/app/repository/lookups.py b/app/repository/lookups.py index 936ed034..2037cd54 100644 --- a/app/repository/lookups.py +++ b/app/repository/lookups.py @@ -8,7 +8,7 @@ from sqlalchemy.orm import Session from app.models.metadata import ApplicationConfig -from app.repository.corpus import get_allowed_corpora +from app.repository.corpus import get_config_for_allowed_corpora from app.repository.organisation import get_organisation_config, get_organisations from app.service.pipeline import IMPORT_ID_MATCHER from app.service.util import tree_table_to_json @@ -29,7 +29,7 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: variant.variant_name for variant in db.query(Variant).order_by(Variant.variant_name).all() ], - corpora=get_allowed_corpora(db, allowed_corpora), + corpora=get_config_for_allowed_corpora(db, allowed_corpora), ) diff --git a/tests/non_search/routers/lookups/test_config.py b/tests/non_search/routers/lookups/test_config.py index 8c112b84..c9ffcab6 100644 --- a/tests/non_search/routers/lookups/test_config.py +++ b/tests/non_search/routers/lookups/test_config.py @@ -112,6 +112,13 @@ def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_ corpora = response_json["corpora"] assert len(corpora) == 2 + assert corpora[0]["count_by_category"] == { + "Executive": 0, + "Legislative": 0, + "UNFCCC": 0, + "MCF": 0, + } + assert corpora[0]["corpus_import_id"] == "CCLW.corpus.i00000001.n0000" assert corpora[0]["corpus_type"] == "Laws and Policies" assert corpora[0]["organisation_name"] == "CCLW" @@ -232,6 +239,21 @@ def test_config_endpoint_cclw_stats(data_client, data_db, valid_token): response_json = response.json() + corpora = response_json["corpora"] + assert len(corpora) == 2 + cclw_corpus_config = next( + (corpus for corpus in corpora if "CCLW" in corpus["corpus_import_id"]), {} + ) + laws = cclw_corpus_config["count_by_category"]["Legislative"] + policies = cclw_corpus_config["count_by_category"]["Executive"] + unfccc = cclw_corpus_config["count_by_category"]["UNFCCC"] + assert laws == 2 + assert policies == 3 + assert unfccc == 1 + + assert cclw_corpus_config["total"] == laws + policies + unfccc + + # Below to be removed as part of PDCT-1759 org_config = response_json["organisations"]["CCLW"] assert len(org_config) == LEN_ORG_CONFIG assert org_config["total"] == 6 @@ -293,6 +315,17 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only( response_json = response.json() + assert len(response_json["corpora"]) == 1 + corpus_config = response_json["corpora"][0] + assert corpus_config["total"] == 1 + assert corpus_config["count_by_category"] == { + "Executive": 0, + "Legislative": 1, + "MCF": 0, + "UNFCCC": 0, + } + + # Below to be removed as part of PDCT-1759 org_config = response_json["organisations"] expected_org_config = { expected_organisation: { @@ -319,7 +352,7 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only( "MCF": 0, "UNFCCC": 0, }, - } + }, } assert org_config == expected_org_config @@ -357,20 +390,33 @@ def test_config_endpoint_returns_stats_for_all_orgs_if_no_allowed_corpora_in_app ) _add_family(data_db, "T.0.0.1", FamilyCategory.EXECUTIVE, cclw_corpus.import_id) - _add_family(data_db, "T.0.0.2", FamilyCategory.LEGISLATIVE, unfccc_corpus.import_id) + _add_family(data_db, "T.0.0.2", FamilyCategory.EXECUTIVE, unfccc_corpus.import_id) data_db.flush() response = data_client.get(url_under_test, headers={"app-token": app_token}) response_json = response.json() + + assert len(response_json["corpora"]) == 2 + corpora = response_json["corpora"] + for corpus in corpora: + assert corpus["total"] == 1 + assert corpus["count_by_category"] == { + "Executive": 1, + "Legislative": 0, + "MCF": 0, + "UNFCCC": 0, + } + + # Below to be removed as part of PDCT-1759 org_config = response_json["organisations"] assert list(org_config.keys()) == ["CCLW", "UNFCCC"] assert org_config["CCLW"]["total"] == 1 assert org_config["UNFCCC"]["total"] == 1 assert org_config["UNFCCC"]["count_by_category"] == { - "Executive": 0, - "Legislative": 1, + "Executive": 1, + "Legislative": 0, "MCF": 0, "UNFCCC": 0, } From 548b84438b1538beb57f3de374600c804302b9a6 Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Wed, 18 Dec 2024 11:15:20 +0000 Subject: [PATCH 04/13] Bump minor version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c0cfb6e2..3351683c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.21" +version = "1.20.0" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] From 235975c6842d53059a0e57e85dc4c6c399490753 Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Wed, 18 Dec 2024 15:46:37 +0000 Subject: [PATCH 05/13] Update the /config response from corpora -> corpus_types --- app/models/metadata.py | 17 +++-- app/repository/corpus.py | 59 ++++++++------ app/repository/lookups.py | 2 +- .../non_search/routers/lookups/test_config.py | 76 +++++++++---------- 4 files changed, 84 insertions(+), 70 deletions(-) diff --git a/app/models/metadata.py b/app/models/metadata.py index 520d422d..157557ce 100644 --- a/app/models/metadata.py +++ b/app/models/metadata.py @@ -26,8 +26,7 @@ class OrganisationConfig(BaseModel): class CorpusConfig(BaseModel): - """Contains the Corpus, Organisation and CorpusType info - as well as stats used on homepage""" + """Contains the Corpus and Organisation info as well as stats used on homepage""" # From corpus corpus_import_id: str @@ -35,10 +34,6 @@ class CorpusConfig(BaseModel): description: str image_url: str text: str - # From corpus_type - corpus_type: str - corpus_type_description: str - taxonomy: TaxonomyData # From organisation organisation_name: str organisation_id: int @@ -47,6 +42,14 @@ class CorpusConfig(BaseModel): count_by_category: Mapping[str, int] +class CorpusTypeConfig(BaseModel): + """Contains the CorpusType info as well as data of any corpora of that type""" + + corpus_type_description: str + taxonomy: TaxonomyData + corpora: Sequence[CorpusConfig] + + class ApplicationConfig(BaseModel): """Definition of the new Config which just includes taxonomy.""" @@ -54,4 +57,4 @@ class ApplicationConfig(BaseModel): organisations: Mapping[str, OrganisationConfig] languages: Mapping[str, str] document_variants: Sequence[str] - corpora: Sequence[CorpusConfig] + corpus_types: Mapping[str, CorpusTypeConfig] diff --git a/app/repository/corpus.py b/app/repository/corpus.py index af3714c4..771dd2fe 100644 --- a/app/repository/corpus.py +++ b/app/repository/corpus.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Mapping from db_client.models.dfce.family import Corpus, Family, FamilyCategory, FamilyCorpus from db_client.models.organisation import CorpusType, Organisation @@ -6,7 +6,7 @@ from sqlalchemy.orm import Session from app import config -from app.models.metadata import CorpusConfig +from app.models.metadata import CorpusConfig, CorpusTypeConfig def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: @@ -37,32 +37,42 @@ def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str return {"total": total, "count_by_category": count_by_category} -def _to_corpus_config(row, stats: dict[str, Any]) -> CorpusConfig: +def _to_corpus_type_config(row, stats: dict[str, Any]) -> dict[str, CorpusTypeConfig]: image_url = ( f"https://{config.CDN_DOMAIN}/{row.image_url}" if row.image_url is not None and len(row.image_url) > 0 else "" ) corpus_text = row.text if row.text is not None else "" - return CorpusConfig( - corpus_import_id=row.corpus_import_id, - title=row.title, - description=row.description, - corpus_type=row.corpus_type, - corpus_type_description=row.corpus_type_description, - taxonomy={**row.taxonomy}, - text=corpus_text, - image_url=image_url, - organisation_id=row.organisation_id, - organisation_name=row.organisation_name, - total=stats["total"], - count_by_category=stats["count_by_category"], - ) + return { + row.corpus_type: CorpusTypeConfig( + corpus_type_description=row.corpus_type_description, + taxonomy={**row.taxonomy}, + corpora=[ + CorpusConfig( + title=row.title, + description=row.description, + corpus_import_id=row.corpus_import_id, + text=corpus_text, + image_url=image_url, + organisation_id=row.organisation_id, + organisation_name=row.organisation_name, + total=stats["total"], + count_by_category=stats["count_by_category"], + ) + ], + ) + } + + +def _get_config_for_corpus(db: Session, row) -> dict[str, CorpusTypeConfig]: + stats = _get_family_stats_per_corpus(db, row.corpus_import_id) + return _to_corpus_type_config(row, stats) def get_config_for_allowed_corpora( db: Session, allowed_corpora: list[str] -) -> list[CorpusConfig]: +) -> Mapping[str, CorpusTypeConfig]: query = ( db.query( Corpus.import_id.label("corpus_import_id"), @@ -86,10 +96,11 @@ def get_config_for_allowed_corpora( query = query.filter(Corpus.import_id.in_(allowed_corpora)) corpora = query.all() + configs_for_each_allowed_corpus = ( + _get_config_for_corpus(db, row) for row in corpora + ) + config_for_allowed_corpora = { + k: v for d in configs_for_each_allowed_corpus for k, v in d.items() + } - return [ - _to_corpus_config( - row, _get_family_stats_per_corpus(db=db, corpus_import_id=row[0]) - ) - for row in corpora - ] + return config_for_allowed_corpora diff --git a/app/repository/lookups.py b/app/repository/lookups.py index 2037cd54..5431311e 100644 --- a/app/repository/lookups.py +++ b/app/repository/lookups.py @@ -29,7 +29,7 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: variant.variant_name for variant in db.query(Variant).order_by(Variant.variant_name).all() ], - corpora=get_config_for_allowed_corpora(db, allowed_corpora), + corpus_types=get_config_for_allowed_corpora(db, allowed_corpora), ) diff --git a/tests/non_search/routers/lookups/test_config.py b/tests/non_search/routers/lookups/test_config.py index c9ffcab6..8bd68376 100644 --- a/tests/non_search/routers/lookups/test_config.py +++ b/tests/non_search/routers/lookups/test_config.py @@ -90,11 +90,13 @@ def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_ response_json = response.json() assert response.status_code == OK - assert ( - set(response_json.keys()) - ^ {"geographies", "organisations", "document_variants", "languages", "corpora"} - == set() - ) + assert set(response_json.keys()) == { + "geographies", + "organisations", + "document_variants", + "languages", + "corpus_types", + } assert "geographies" in response_json assert len(response_json["geographies"]) == 8 @@ -109,49 +111,47 @@ def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_ assert len(response_json["document_variants"]) == 2 assert "Original Language" in response_json["document_variants"] - corpora = response_json["corpora"] - assert len(corpora) == 2 + corpus_types = response_json["corpus_types"] + assert list(corpus_types.keys()) == ["Laws and Policies", "Intl. agreements"] + + laws_and_policies = corpus_types["Laws and Policies"] + assert laws_and_policies["corpus_type_description"] == "Laws and policies" - assert corpora[0]["count_by_category"] == { + taxonomy = laws_and_policies["taxonomy"] + assert set(taxonomy) ^ EXPECTED_CCLW_TAXONOMY == set() + # Check document roles. + assert "role" in taxonomy["_document"].keys() + assert len(taxonomy["_document"]["role"]["allowed_values"]) == 10 + assert "MAIN" in taxonomy["_document"]["role"]["allowed_values"] + # Check document roles. + assert "type" in taxonomy["_document"].keys() + assert len(taxonomy["_document"]["type"]["allowed_values"]) == 76 + assert "Adaptation Communication" in taxonomy["_document"]["type"]["allowed_values"] + # Check event types. + assert len(taxonomy["_event"]["event_type"]["allowed_values"]) == 17 + assert "Passed/Approved" in taxonomy["_event"]["event_type"]["allowed_values"] + + assert len(laws_and_policies["corpora"]) == 1 + cclw_corpus = laws_and_policies["corpora"][0] + + assert cclw_corpus["total"] == 0 + assert cclw_corpus["count_by_category"] == { "Executive": 0, "Legislative": 0, "UNFCCC": 0, "MCF": 0, } - assert corpora[0]["corpus_import_id"] == "CCLW.corpus.i00000001.n0000" - assert corpora[0]["corpus_type"] == "Laws and Policies" - assert corpora[0]["organisation_name"] == "CCLW" - assert corpora[0]["organisation_id"] == 1 + assert cclw_corpus["corpus_import_id"] == "CCLW.corpus.i00000001.n0000" + assert cclw_corpus["organisation_name"] == "CCLW" + assert cclw_corpus["organisation_id"] == 1 assert ( - corpora[0]["image_url"] + cclw_corpus["image_url"] == "https://cdn.climatepolicyradar.org/corpora/CCLW.corpus.i00000001.n0000/logo.png" ) - assert "Grantham Research Institute" in corpora[0]["text"] - assert corpora[0]["corpus_type_description"] == "Laws and policies" - assert corpora[0]["description"] == "CCLW national policies" - assert corpora[0]["title"] == "CCLW national policies" - assert set(corpora[0]["taxonomy"]) ^ EXPECTED_CCLW_TAXONOMY == set() - - # Check document roles. - assert "role" in corpora[0]["taxonomy"]["_document"].keys() - assert len(corpora[0]["taxonomy"]["_document"]["role"]["allowed_values"]) == 10 - assert "MAIN" in corpora[0]["taxonomy"]["_document"]["role"]["allowed_values"] - - # Check document roles. - assert "type" in corpora[0]["taxonomy"]["_document"].keys() - assert len(corpora[0]["taxonomy"]["_document"]["type"]["allowed_values"]) == 76 - assert ( - "Adaptation Communication" - in corpora[0]["taxonomy"]["_document"]["type"]["allowed_values"] - ) - - # Check event types. - assert len(corpora[0]["taxonomy"]["_event"]["event_type"]["allowed_values"]) == 17 - assert ( - "Passed/Approved" - in corpora[0]["taxonomy"]["_event"]["event_type"]["allowed_values"] - ) + assert "Grantham Research Institute" in cclw_corpus["text"] + assert cclw_corpus["description"] == "CCLW national policies" + assert cclw_corpus["title"] == "CCLW national policies" # Below to be removed as part of PDCT-1759 # Now test organisations From 6ecf5846f23aaf50f7ec25bff3e4b43ea28fd268 Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Wed, 18 Dec 2024 16:37:34 +0000 Subject: [PATCH 06/13] Fix remaining tests --- .../non_search/routers/lookups/test_config.py | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/non_search/routers/lookups/test_config.py b/tests/non_search/routers/lookups/test_config.py index 8bd68376..9122e21b 100644 --- a/tests/non_search/routers/lookups/test_config.py +++ b/tests/non_search/routers/lookups/test_config.py @@ -239,11 +239,10 @@ def test_config_endpoint_cclw_stats(data_client, data_db, valid_token): response_json = response.json() - corpora = response_json["corpora"] - assert len(corpora) == 2 - cclw_corpus_config = next( - (corpus for corpus in corpora if "CCLW" in corpus["corpus_import_id"]), {} - ) + corpus_types = response_json["corpus_types"] + assert len(corpus_types) == 2 + + cclw_corpus_config = corpus_types["Laws and Policies"]["corpora"][0] laws = cclw_corpus_config["count_by_category"]["Legislative"] policies = cclw_corpus_config["count_by_category"]["Executive"] unfccc = cclw_corpus_config["count_by_category"]["UNFCCC"] @@ -315,10 +314,11 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only( response_json = response.json() - assert len(response_json["corpora"]) == 1 - corpus_config = response_json["corpora"][0] - assert corpus_config["total"] == 1 - assert corpus_config["count_by_category"] == { + assert len(response_json["corpus_types"]) == 1 + + corpus = response_json["corpus_types"][expected_corpus_type.name]["corpora"][0] + assert corpus["total"] == 1 + assert corpus["count_by_category"] == { "Executive": 0, "Legislative": 1, "MCF": 0, @@ -397,16 +397,18 @@ def test_config_endpoint_returns_stats_for_all_orgs_if_no_allowed_corpora_in_app response_json = response.json() - assert len(response_json["corpora"]) == 2 - corpora = response_json["corpora"] - for corpus in corpora: - assert corpus["total"] == 1 - assert corpus["count_by_category"] == { - "Executive": 1, - "Legislative": 0, - "MCF": 0, - "UNFCCC": 0, - } + assert len(response_json["corpus_types"]) == 2 + corpus_types = response_json["corpus_types"] + + for corpus_type in list(corpus_types.values()): + for corpus in corpus_type["corpora"]: + assert corpus["total"] == 1 + assert corpus["count_by_category"] == { + "Executive": 1, + "Legislative": 0, + "MCF": 0, + "UNFCCC": 0, + } # Below to be removed as part of PDCT-1759 org_config = response_json["organisations"] From 74d8c35ad8b2ab895d54d1a59e34110fc7bca1e4 Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Thu, 19 Dec 2024 11:04:28 +0000 Subject: [PATCH 07/13] Rename metadata model to config and add corpus type name to the config object --- app/api/api_v1/routers/lookups/config.py | 2 +- app/models/{metadata.py => config.py} | 1 + app/repository/corpus.py | 4 +++- app/repository/lookups.py | 2 +- app/repository/organisation.py | 2 +- tests/non_search/routers/lookups/test_config.py | 1 + 6 files changed, 8 insertions(+), 4 deletions(-) rename app/models/{metadata.py => config.py} (98%) diff --git a/app/api/api_v1/routers/lookups/config.py b/app/api/api_v1/routers/lookups/config.py index 71981ca8..d2151839 100644 --- a/app/api/api_v1/routers/lookups/config.py +++ b/app/api/api_v1/routers/lookups/config.py @@ -4,7 +4,7 @@ from app.api.api_v1.routers.lookups.router import lookups_router from app.clients.db.session import get_db -from app.models.metadata import ApplicationConfig +from app.models.config import ApplicationConfig from app.repository.lookups import get_config from app.service.custom_app import AppTokenFactory diff --git a/app/models/metadata.py b/app/models/config.py similarity index 98% rename from app/models/metadata.py rename to app/models/config.py index 157557ce..2f8c2210 100644 --- a/app/models/metadata.py +++ b/app/models/config.py @@ -45,6 +45,7 @@ class CorpusConfig(BaseModel): class CorpusTypeConfig(BaseModel): """Contains the CorpusType info as well as data of any corpora of that type""" + corpus_type_name: str corpus_type_description: str taxonomy: TaxonomyData corpora: Sequence[CorpusConfig] diff --git a/app/repository/corpus.py b/app/repository/corpus.py index 771dd2fe..01c330e3 100644 --- a/app/repository/corpus.py +++ b/app/repository/corpus.py @@ -6,7 +6,7 @@ from sqlalchemy.orm import Session from app import config -from app.models.metadata import CorpusConfig, CorpusTypeConfig +from app.models.config import CorpusConfig, CorpusTypeConfig def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: @@ -46,6 +46,7 @@ def _to_corpus_type_config(row, stats: dict[str, Any]) -> dict[str, CorpusTypeCo corpus_text = row.text if row.text is not None else "" return { row.corpus_type: CorpusTypeConfig( + corpus_type_name=row.corpus_type_name, corpus_type_description=row.corpus_type_description, taxonomy={**row.taxonomy}, corpora=[ @@ -81,6 +82,7 @@ def get_config_for_allowed_corpora( Corpus.corpus_image_url.label("image_url"), Corpus.corpus_text.label("text"), Corpus.corpus_type_name.label("corpus_type"), + CorpusType.name.label("corpus_type_name"), CorpusType.description.label("corpus_type_description"), CorpusType.valid_metadata.label("taxonomy"), Organisation.id.label("organisation_id"), diff --git a/app/repository/lookups.py b/app/repository/lookups.py index 5431311e..2d77377d 100644 --- a/app/repository/lookups.py +++ b/app/repository/lookups.py @@ -7,7 +7,7 @@ from sqlalchemy.exc import MultipleResultsFound from sqlalchemy.orm import Session -from app.models.metadata import ApplicationConfig +from app.models.config import ApplicationConfig from app.repository.corpus import get_config_for_allowed_corpora from app.repository.organisation import get_organisation_config, get_organisations from app.service.pipeline import IMPORT_ID_MATCHER diff --git a/app/repository/organisation.py b/app/repository/organisation.py index f2947066..7e2d8480 100644 --- a/app/repository/organisation.py +++ b/app/repository/organisation.py @@ -6,7 +6,7 @@ from sqlalchemy.orm import Session from app import config -from app.models.metadata import CorpusData, OrganisationConfig +from app.models.config import CorpusData, OrganisationConfig def _to_corpus_data(row) -> CorpusData: diff --git a/tests/non_search/routers/lookups/test_config.py b/tests/non_search/routers/lookups/test_config.py index 9122e21b..5e3f84a1 100644 --- a/tests/non_search/routers/lookups/test_config.py +++ b/tests/non_search/routers/lookups/test_config.py @@ -115,6 +115,7 @@ def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_ assert list(corpus_types.keys()) == ["Laws and Policies", "Intl. agreements"] laws_and_policies = corpus_types["Laws and Policies"] + assert laws_and_policies["corpus_type_name"] == "Laws and Policies" assert laws_and_policies["corpus_type_description"] == "Laws and policies" taxonomy = laws_and_policies["taxonomy"] From ed58a1bf6d2c964f697488796c13179565c0e7e7 Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Thu, 19 Dec 2024 14:10:53 +0000 Subject: [PATCH 08/13] Refactor --- app/repository/corpus.py | 96 ++++------------------------------ app/repository/corpus_type.py | 6 +++ app/repository/lookups.py | 5 +- app/repository/organisation.py | 4 ++ app/service/config.py | 94 +++++++++++++++++++++++++++++++++ 5 files changed, 115 insertions(+), 90 deletions(-) create mode 100644 app/repository/corpus_type.py create mode 100644 app/service/config.py diff --git a/app/repository/corpus.py b/app/repository/corpus.py index 01c330e3..b05d3a47 100644 --- a/app/repository/corpus.py +++ b/app/repository/corpus.py @@ -1,108 +1,30 @@ -from typing import Any, Mapping - -from db_client.models.dfce.family import Corpus, Family, FamilyCategory, FamilyCorpus -from db_client.models.organisation import CorpusType, Organisation +from db_client.models.dfce.family import Corpus, Family, FamilyCorpus from sqlalchemy import func from sqlalchemy.orm import Session -from app import config -from app.models.config import CorpusConfig, CorpusTypeConfig - -def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: - total = ( +def get_total_families_per_corpus(db: Session, corpus_import_id: str) -> int: + return ( db.query(Family) .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) .filter(FamilyCorpus.corpus_import_id == corpus_import_id) .count() ) - counts = ( + +def get_family_count_by_category_per_corpus(db: Session, corpus_import_id: str): + return ( db.query(Family.family_category, func.count()) .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) .filter(FamilyCorpus.corpus_import_id == corpus_import_id) .group_by(Family.family_category) .all() ) - found_categories = {c[0].value: c[1] for c in counts} - count_by_category = {} - - # Supply zeros when there aren't any - for category in [e.value for e in FamilyCategory]: - if category in found_categories.keys(): - count_by_category[category] = found_categories[category] - else: - count_by_category[category] = 0 - return {"total": total, "count_by_category": count_by_category} - - -def _to_corpus_type_config(row, stats: dict[str, Any]) -> dict[str, CorpusTypeConfig]: - image_url = ( - f"https://{config.CDN_DOMAIN}/{row.image_url}" - if row.image_url is not None and len(row.image_url) > 0 - else "" - ) - corpus_text = row.text if row.text is not None else "" - return { - row.corpus_type: CorpusTypeConfig( - corpus_type_name=row.corpus_type_name, - corpus_type_description=row.corpus_type_description, - taxonomy={**row.taxonomy}, - corpora=[ - CorpusConfig( - title=row.title, - description=row.description, - corpus_import_id=row.corpus_import_id, - text=corpus_text, - image_url=image_url, - organisation_id=row.organisation_id, - organisation_name=row.organisation_name, - total=stats["total"], - count_by_category=stats["count_by_category"], - ) - ], - ) - } - -def _get_config_for_corpus(db: Session, row) -> dict[str, CorpusTypeConfig]: - stats = _get_family_stats_per_corpus(db, row.corpus_import_id) - return _to_corpus_type_config(row, stats) - - -def get_config_for_allowed_corpora( - db: Session, allowed_corpora: list[str] -) -> Mapping[str, CorpusTypeConfig]: - query = ( - db.query( - Corpus.import_id.label("corpus_import_id"), - Corpus.title.label("title"), - Corpus.description.label("description"), - Corpus.corpus_image_url.label("image_url"), - Corpus.corpus_text.label("text"), - Corpus.corpus_type_name.label("corpus_type"), - CorpusType.name.label("corpus_type_name"), - CorpusType.description.label("corpus_type_description"), - CorpusType.valid_metadata.label("taxonomy"), - Organisation.id.label("organisation_id"), - Organisation.name.label("organisation_name"), - ) - .join( - CorpusType, - Corpus.corpus_type_name == CorpusType.name, - ) - .join(Organisation, Corpus.organisation_id == Organisation.id) - ) +def get_allowed_corpora(db: Session, allowed_corpora: list[str]) -> list[Corpus]: + query = db.query(Corpus) if allowed_corpora != []: query = query.filter(Corpus.import_id.in_(allowed_corpora)) - corpora = query.all() - configs_for_each_allowed_corpus = ( - _get_config_for_corpus(db, row) for row in corpora - ) - config_for_allowed_corpora = { - k: v for d in configs_for_each_allowed_corpus for k, v in d.items() - } - - return config_for_allowed_corpora + return query.all() diff --git a/app/repository/corpus_type.py b/app/repository/corpus_type.py new file mode 100644 index 00000000..c4b52b77 --- /dev/null +++ b/app/repository/corpus_type.py @@ -0,0 +1,6 @@ +from db_client.models.organisation import CorpusType +from sqlalchemy.orm import Session + + +def get(db: Session, corpus_type_name: str) -> CorpusType: + return db.query(CorpusType).filter(CorpusType.name == corpus_type_name).one() diff --git a/app/repository/lookups.py b/app/repository/lookups.py index 2d77377d..5b33afe2 100644 --- a/app/repository/lookups.py +++ b/app/repository/lookups.py @@ -8,8 +8,8 @@ from sqlalchemy.orm import Session from app.models.config import ApplicationConfig -from app.repository.corpus import get_config_for_allowed_corpora from app.repository.organisation import get_organisation_config, get_organisations +from app.service.config import get_corpus_type_config_for_allowed_corpora from app.service.pipeline import IMPORT_ID_MATCHER from app.service.util import tree_table_to_json @@ -17,7 +17,6 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: - # First get the CCLW stats return ApplicationConfig( geographies=tree_table_to_json(table=Geography, db=db), organisations={ @@ -29,7 +28,7 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: variant.variant_name for variant in db.query(Variant).order_by(Variant.variant_name).all() ], - corpus_types=get_config_for_allowed_corpora(db, allowed_corpora), + corpus_types=get_corpus_type_config_for_allowed_corpora(db, allowed_corpora), ) diff --git a/app/repository/organisation.py b/app/repository/organisation.py index 7e2d8480..ba45c1a5 100644 --- a/app/repository/organisation.py +++ b/app/repository/organisation.py @@ -94,3 +94,7 @@ def get_organisations(db: Session, allowed_corpora: list[str]) -> list[Organisat if allowed_corpora != []: query = query.filter(Corpus.import_id.in_(allowed_corpora)) return query.all() + + +def get(db: Session, org_id: int) -> Organisation: + return db.query(Organisation).filter(Organisation.id == org_id).one() diff --git a/app/service/config.py b/app/service/config.py new file mode 100644 index 00000000..1891aac4 --- /dev/null +++ b/app/service/config.py @@ -0,0 +1,94 @@ +from typing import Any, Mapping + +from db_client.models.dfce.family import FamilyCategory +from db_client.models.organisation import Corpus, CorpusType, Organisation +from sqlalchemy.orm import Session + +from app import config +from app.models.config import CorpusConfig, CorpusTypeConfig +from app.repository import corpus_type as corpus_type_repo +from app.repository import organisation as org_repo +from app.repository.corpus import ( + get_allowed_corpora, + get_family_count_by_category_per_corpus, + get_total_families_per_corpus, +) + + +def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: + total = get_total_families_per_corpus(db, corpus_import_id) + + counts = get_family_count_by_category_per_corpus(db, corpus_import_id) + found_categories = {c[0].value: c[1] for c in counts} + count_by_category = {} + + # Supply zeros when there aren't any + for category in [e.value for e in FamilyCategory]: + if category in found_categories.keys(): + count_by_category[category] = found_categories[category] + else: + count_by_category[category] = 0 + + return {"total": total, "count_by_category": count_by_category} + + +def _to_corpus_type_config( + corpus: Corpus, + corpus_type: CorpusType, + organisation: Organisation, + stats: dict[str, Any], +) -> CorpusTypeConfig: + image_url = ( + f"https://{config.CDN_DOMAIN}/{corpus.corpus_image_url}" + if corpus.corpus_image_url is not None and len(corpus.corpus_image_url) > 0 + else "" + ) + corpus_text = corpus.corpus_text if corpus.corpus_text is not None else "" + + return CorpusTypeConfig( + corpus_type_name=corpus_type.name, + corpus_type_description=corpus_type.description, + taxonomy={**corpus_type.valid_metadata}, + corpora=[ + CorpusConfig( + title=corpus.title, + description=corpus.description, + corpus_import_id=corpus.import_id, + text=corpus_text, + image_url=image_url, + organisation_id=organisation.id, + organisation_name=organisation.name, + total=stats["total"], + count_by_category=stats["count_by_category"], + ) + ], + ) + + +def _get_config_for_corpus_type( + db: Session, corpus: Corpus +) -> dict[str, CorpusTypeConfig]: + stats = _get_family_stats_per_corpus(db, corpus.import_id) + corpus_type = corpus_type_repo.get(db, corpus.corpus_type_name) + organisation = org_repo.get(db, corpus.organisation_id) + return { + corpus_type.name: _to_corpus_type_config( + corpus, corpus_type, organisation, stats + ) + } + + +def get_corpus_type_config_for_allowed_corpora( + db: Session, allowed_corpora: list[str] +) -> Mapping[str, CorpusTypeConfig]: + + corpora = get_allowed_corpora(db, allowed_corpora) + + configs_for_each_allowed_corpus = ( + _get_config_for_corpus_type(db, corpus) for corpus in corpora + ) + corpus_type_config_for_allowed_corpora = { + k: v for config in configs_for_each_allowed_corpus for k, v in config.items() + } + + return corpus_type_config_for_allowed_corpora From e21b9b20f7585540441fe10466c9fad7ff80c3e0 Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Thu, 19 Dec 2024 17:35:10 +0000 Subject: [PATCH 09/13] Fix most of the pyright errors --- app/service/config.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/app/service/config.py b/app/service/config.py index 1891aac4..720aaf68 100644 --- a/app/service/config.py +++ b/app/service/config.py @@ -40,24 +40,23 @@ def _to_corpus_type_config( ) -> CorpusTypeConfig: image_url = ( f"https://{config.CDN_DOMAIN}/{corpus.corpus_image_url}" - if corpus.corpus_image_url is not None and len(corpus.corpus_image_url) > 0 + if corpus.corpus_image_url is not None and len(str(corpus.corpus_image_url)) > 0 else "" ) corpus_text = corpus.corpus_text if corpus.corpus_text is not None else "" - return CorpusTypeConfig( - corpus_type_name=corpus_type.name, - corpus_type_description=corpus_type.description, + corpus_type_name=str(corpus_type.name), + corpus_type_description=str(corpus_type.description), taxonomy={**corpus_type.valid_metadata}, corpora=[ CorpusConfig( - title=corpus.title, - description=corpus.description, - corpus_import_id=corpus.import_id, - text=corpus_text, + title=str(corpus.title), + description=str(corpus.description), + corpus_import_id=str(corpus.import_id), + text=str(corpus_text), image_url=image_url, - organisation_id=organisation.id, - organisation_name=organisation.name, + organisation_id=int(str(organisation.id)), + organisation_name=str(organisation.name), total=stats["total"], count_by_category=stats["count_by_category"], ) @@ -68,11 +67,11 @@ def _to_corpus_type_config( def _get_config_for_corpus_type( db: Session, corpus: Corpus ) -> dict[str, CorpusTypeConfig]: - stats = _get_family_stats_per_corpus(db, corpus.import_id) - corpus_type = corpus_type_repo.get(db, corpus.corpus_type_name) - organisation = org_repo.get(db, corpus.organisation_id) + stats = _get_family_stats_per_corpus(db, str(corpus.import_id)) + corpus_type = corpus_type_repo.get(db, str(corpus.corpus_type_name)) + organisation = org_repo.get(db, int(str(corpus.organisation_id))) return { - corpus_type.name: _to_corpus_type_config( + str(corpus_type.name): _to_corpus_type_config( corpus, corpus_type, organisation, stats ) } From acc201f37232f3f6c193047d01cfe9095e667d3c Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Mon, 6 Jan 2025 14:20:29 +0000 Subject: [PATCH 10/13] Resolve final lint error --- app/service/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/service/config.py b/app/service/config.py index 720aaf68..e4db7158 100644 --- a/app/service/config.py +++ b/app/service/config.py @@ -2,6 +2,7 @@ from db_client.models.dfce.family import FamilyCategory from db_client.models.organisation import Corpus, CorpusType, Organisation +from sqlalchemy import cast from sqlalchemy.orm import Session from app import config @@ -47,7 +48,7 @@ def _to_corpus_type_config( return CorpusTypeConfig( corpus_type_name=str(corpus_type.name), corpus_type_description=str(corpus_type.description), - taxonomy={**corpus_type.valid_metadata}, + taxonomy={**cast(corpus_type.valid_metadata)}, corpora=[ CorpusConfig( title=str(corpus.title), From 7ae77bea7a4efba4f35186502a31771189f2cf74 Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Mon, 6 Jan 2025 14:30:59 +0000 Subject: [PATCH 11/13] Add docstrings --- app/repository/corpus.py | 21 +++++++++++++++++++++ app/service/config.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/app/repository/corpus.py b/app/repository/corpus.py index b05d3a47..4de0061f 100644 --- a/app/repository/corpus.py +++ b/app/repository/corpus.py @@ -4,6 +4,13 @@ def get_total_families_per_corpus(db: Session, corpus_import_id: str) -> int: + """ + Get the total number of families per corpus. + + :param db: Database session + :param corpus_import_id: The import ID of the corpus + :return: The total number of families per corpus + """ return ( db.query(Family) .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) @@ -13,6 +20,13 @@ def get_total_families_per_corpus(db: Session, corpus_import_id: str) -> int: def get_family_count_by_category_per_corpus(db: Session, corpus_import_id: str): + """ + Get the count of families by category per corpus. + + :param db: Database session + :param corpus_import_id: The import ID of the corpus + :return: A list of tuples where each tuple contains a family category and its count + """ return ( db.query(Family.family_category, func.count()) .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) @@ -23,6 +37,13 @@ def get_family_count_by_category_per_corpus(db: Session, corpus_import_id: str): def get_allowed_corpora(db: Session, allowed_corpora: list[str]) -> list[Corpus]: + """ + Get the allowed corpora. + + :param db: Database session + :param allowed_corpora: A list of allowed corpora + :return: A list of Corpus objects that are allowed + """ query = db.query(Corpus) if allowed_corpora != []: query = query.filter(Corpus.import_id.in_(allowed_corpora)) diff --git a/app/service/config.py b/app/service/config.py index e4db7158..516efffa 100644 --- a/app/service/config.py +++ b/app/service/config.py @@ -17,6 +17,13 @@ def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: + """ + Get family statistics per corpus. + + :param db: Database session + :param corpus_import_id: The import ID of the corpus + :return: A dictionary containing total families and count by category + """ total = get_total_families_per_corpus(db, corpus_import_id) counts = get_family_count_by_category_per_corpus(db, corpus_import_id) @@ -39,6 +46,15 @@ def _to_corpus_type_config( organisation: Organisation, stats: dict[str, Any], ) -> CorpusTypeConfig: + """ + Convert corpus, corpus type, organisation, and stats to CorpusTypeConfig. + + :param corpus: Corpus object + :param corpus_type: CorpusType object + :param organisation: Organisation object + :param stats: A dictionary containing statistics + :return: A CorpusTypeConfig object + """ image_url = ( f"https://{config.CDN_DOMAIN}/{corpus.corpus_image_url}" if corpus.corpus_image_url is not None and len(str(corpus.corpus_image_url)) > 0 @@ -48,7 +64,7 @@ def _to_corpus_type_config( return CorpusTypeConfig( corpus_type_name=str(corpus_type.name), corpus_type_description=str(corpus_type.description), - taxonomy={**cast(corpus_type.valid_metadata)}, + taxonomy={**cast(dict, corpus_type.valid_metadata)}, corpora=[ CorpusConfig( title=str(corpus.title), @@ -68,6 +84,13 @@ def _to_corpus_type_config( def _get_config_for_corpus_type( db: Session, corpus: Corpus ) -> dict[str, CorpusTypeConfig]: + """ + Get configuration for a corpus type. + + :param db: Database session + :param corpus: Corpus object + :return: A dictionary containing CorpusTypeConfig + """ stats = _get_family_stats_per_corpus(db, str(corpus.import_id)) corpus_type = corpus_type_repo.get(db, str(corpus.corpus_type_name)) organisation = org_repo.get(db, int(str(corpus.organisation_id))) @@ -81,7 +104,13 @@ def _get_config_for_corpus_type( def get_corpus_type_config_for_allowed_corpora( db: Session, allowed_corpora: list[str] ) -> Mapping[str, CorpusTypeConfig]: + """ + Get CorpusTypeConfig for allowed corpora. + :param db: Database session + :param allowed_corpora: A list of allowed corpora + :return: A mapping of CorpusTypeConfig for allowed corpora + """ corpora = get_allowed_corpora(db, allowed_corpora) configs_for_each_allowed_corpus = ( From 81948142e2cdadb8436c7d54fa55714fa5c83bef Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Mon, 6 Jan 2025 14:34:08 +0000 Subject: [PATCH 12/13] Add missing docstring --- app/repository/corpus_type.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/app/repository/corpus_type.py b/app/repository/corpus_type.py index c4b52b77..86ae648c 100644 --- a/app/repository/corpus_type.py +++ b/app/repository/corpus_type.py @@ -3,4 +3,11 @@ def get(db: Session, corpus_type_name: str) -> CorpusType: + """ + Get a CorpusType object based on its name. + + :param db: Database session + :param corpus_type_name: The name of the corpus type + :return: A CorpusType object + """ return db.query(CorpusType).filter(CorpusType.name == corpus_type_name).one() From 4df4149d9d9a38ac716ca5f6f7b374aa86d610a8 Mon Sep 17 00:00:00 2001 From: Anna Pokorska Date: Mon, 6 Jan 2025 14:40:15 +0000 Subject: [PATCH 13/13] Fix wrong import --- app/service/config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app/service/config.py b/app/service/config.py index 516efffa..f03e72be 100644 --- a/app/service/config.py +++ b/app/service/config.py @@ -1,8 +1,7 @@ -from typing import Any, Mapping +from typing import Any, Mapping, cast from db_client.models.dfce.family import FamilyCategory from db_client.models.organisation import Corpus, CorpusType, Organisation -from sqlalchemy import cast from sqlalchemy.orm import Session from app import config