diff --git a/app/repository/corpus.py b/app/repository/corpus.py index 01c330e3..b05d3a47 100644 --- a/app/repository/corpus.py +++ b/app/repository/corpus.py @@ -1,108 +1,30 @@ -from typing import Any, Mapping - -from db_client.models.dfce.family import Corpus, Family, FamilyCategory, FamilyCorpus -from db_client.models.organisation import CorpusType, Organisation +from db_client.models.dfce.family import Corpus, Family, FamilyCorpus from sqlalchemy import func from sqlalchemy.orm import Session -from app import config -from app.models.config import CorpusConfig, CorpusTypeConfig - -def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: - total = ( +def get_total_families_per_corpus(db: Session, corpus_import_id: str) -> int: + return ( db.query(Family) .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) .filter(FamilyCorpus.corpus_import_id == corpus_import_id) .count() ) - counts = ( + +def get_family_count_by_category_per_corpus(db: Session, corpus_import_id: str): + return ( db.query(Family.family_category, func.count()) .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) .filter(FamilyCorpus.corpus_import_id == corpus_import_id) .group_by(Family.family_category) .all() ) - found_categories = {c[0].value: c[1] for c in counts} - count_by_category = {} - - # Supply zeros when there aren't any - for category in [e.value for e in FamilyCategory]: - if category in found_categories.keys(): - count_by_category[category] = found_categories[category] - else: - count_by_category[category] = 0 - return {"total": total, "count_by_category": count_by_category} - - -def _to_corpus_type_config(row, stats: dict[str, Any]) -> dict[str, CorpusTypeConfig]: - image_url = ( - f"https://{config.CDN_DOMAIN}/{row.image_url}" - if row.image_url is not None and len(row.image_url) > 0 - else "" - ) - corpus_text = row.text if row.text is not None else "" - return { - row.corpus_type: CorpusTypeConfig( - corpus_type_name=row.corpus_type_name, - corpus_type_description=row.corpus_type_description, - taxonomy={**row.taxonomy}, - corpora=[ - CorpusConfig( - title=row.title, - description=row.description, - corpus_import_id=row.corpus_import_id, - text=corpus_text, - image_url=image_url, - organisation_id=row.organisation_id, - organisation_name=row.organisation_name, - total=stats["total"], - count_by_category=stats["count_by_category"], - ) - ], - ) - } - -def _get_config_for_corpus(db: Session, row) -> dict[str, CorpusTypeConfig]: - stats = _get_family_stats_per_corpus(db, row.corpus_import_id) - return _to_corpus_type_config(row, stats) - - -def get_config_for_allowed_corpora( - db: Session, allowed_corpora: list[str] -) -> Mapping[str, CorpusTypeConfig]: - query = ( - db.query( - Corpus.import_id.label("corpus_import_id"), - Corpus.title.label("title"), - Corpus.description.label("description"), - Corpus.corpus_image_url.label("image_url"), - Corpus.corpus_text.label("text"), - Corpus.corpus_type_name.label("corpus_type"), - CorpusType.name.label("corpus_type_name"), - CorpusType.description.label("corpus_type_description"), - CorpusType.valid_metadata.label("taxonomy"), - Organisation.id.label("organisation_id"), - Organisation.name.label("organisation_name"), - ) - .join( - CorpusType, - Corpus.corpus_type_name == CorpusType.name, - ) - .join(Organisation, Corpus.organisation_id == Organisation.id) - ) +def get_allowed_corpora(db: Session, allowed_corpora: list[str]) -> list[Corpus]: + query = db.query(Corpus) if allowed_corpora != []: query = query.filter(Corpus.import_id.in_(allowed_corpora)) - corpora = query.all() - configs_for_each_allowed_corpus = ( - _get_config_for_corpus(db, row) for row in corpora - ) - config_for_allowed_corpora = { - k: v for d in configs_for_each_allowed_corpus for k, v in d.items() - } - - return config_for_allowed_corpora + return query.all() diff --git a/app/repository/corpus_type.py b/app/repository/corpus_type.py new file mode 100644 index 00000000..c4b52b77 --- /dev/null +++ b/app/repository/corpus_type.py @@ -0,0 +1,6 @@ +from db_client.models.organisation import CorpusType +from sqlalchemy.orm import Session + + +def get(db: Session, corpus_type_name: str) -> CorpusType: + return db.query(CorpusType).filter(CorpusType.name == corpus_type_name).one() diff --git a/app/repository/lookups.py b/app/repository/lookups.py index 2d77377d..5b33afe2 100644 --- a/app/repository/lookups.py +++ b/app/repository/lookups.py @@ -8,8 +8,8 @@ from sqlalchemy.orm import Session from app.models.config import ApplicationConfig -from app.repository.corpus import get_config_for_allowed_corpora from app.repository.organisation import get_organisation_config, get_organisations +from app.service.config import get_corpus_type_config_for_allowed_corpora from app.service.pipeline import IMPORT_ID_MATCHER from app.service.util import tree_table_to_json @@ -17,7 +17,6 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: - # First get the CCLW stats return ApplicationConfig( geographies=tree_table_to_json(table=Geography, db=db), organisations={ @@ -29,7 +28,7 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig: variant.variant_name for variant in db.query(Variant).order_by(Variant.variant_name).all() ], - corpus_types=get_config_for_allowed_corpora(db, allowed_corpora), + corpus_types=get_corpus_type_config_for_allowed_corpora(db, allowed_corpora), ) diff --git a/app/repository/organisation.py b/app/repository/organisation.py index 7e2d8480..ba45c1a5 100644 --- a/app/repository/organisation.py +++ b/app/repository/organisation.py @@ -94,3 +94,7 @@ def get_organisations(db: Session, allowed_corpora: list[str]) -> list[Organisat if allowed_corpora != []: query = query.filter(Corpus.import_id.in_(allowed_corpora)) return query.all() + + +def get(db: Session, org_id: int) -> Organisation: + return db.query(Organisation).filter(Organisation.id == org_id).one() diff --git a/app/service/config.py b/app/service/config.py new file mode 100644 index 00000000..1891aac4 --- /dev/null +++ b/app/service/config.py @@ -0,0 +1,94 @@ +from typing import Any, Mapping + +from db_client.models.dfce.family import FamilyCategory +from db_client.models.organisation import Corpus, CorpusType, Organisation +from sqlalchemy.orm import Session + +from app import config +from app.models.config import CorpusConfig, CorpusTypeConfig +from app.repository import corpus_type as corpus_type_repo +from app.repository import organisation as org_repo +from app.repository.corpus import ( + get_allowed_corpora, + get_family_count_by_category_per_corpus, + get_total_families_per_corpus, +) + + +def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]: + total = get_total_families_per_corpus(db, corpus_import_id) + + counts = get_family_count_by_category_per_corpus(db, corpus_import_id) + found_categories = {c[0].value: c[1] for c in counts} + count_by_category = {} + + # Supply zeros when there aren't any + for category in [e.value for e in FamilyCategory]: + if category in found_categories.keys(): + count_by_category[category] = found_categories[category] + else: + count_by_category[category] = 0 + + return {"total": total, "count_by_category": count_by_category} + + +def _to_corpus_type_config( + corpus: Corpus, + corpus_type: CorpusType, + organisation: Organisation, + stats: dict[str, Any], +) -> CorpusTypeConfig: + image_url = ( + f"https://{config.CDN_DOMAIN}/{corpus.corpus_image_url}" + if corpus.corpus_image_url is not None and len(corpus.corpus_image_url) > 0 + else "" + ) + corpus_text = corpus.corpus_text if corpus.corpus_text is not None else "" + + return CorpusTypeConfig( + corpus_type_name=corpus_type.name, + corpus_type_description=corpus_type.description, + taxonomy={**corpus_type.valid_metadata}, + corpora=[ + CorpusConfig( + title=corpus.title, + description=corpus.description, + corpus_import_id=corpus.import_id, + text=corpus_text, + image_url=image_url, + organisation_id=organisation.id, + organisation_name=organisation.name, + total=stats["total"], + count_by_category=stats["count_by_category"], + ) + ], + ) + + +def _get_config_for_corpus_type( + db: Session, corpus: Corpus +) -> dict[str, CorpusTypeConfig]: + stats = _get_family_stats_per_corpus(db, corpus.import_id) + corpus_type = corpus_type_repo.get(db, corpus.corpus_type_name) + organisation = org_repo.get(db, corpus.organisation_id) + return { + corpus_type.name: _to_corpus_type_config( + corpus, corpus_type, organisation, stats + ) + } + + +def get_corpus_type_config_for_allowed_corpora( + db: Session, allowed_corpora: list[str] +) -> Mapping[str, CorpusTypeConfig]: + + corpora = get_allowed_corpora(db, allowed_corpora) + + configs_for_each_allowed_corpus = ( + _get_config_for_corpus_type(db, corpus) for corpus in corpora + ) + corpus_type_config_for_allowed_corpora = { + k: v for config in configs_for_each_allowed_corpus for k, v in config.items() + } + + return corpus_type_config_for_allowed_corpora