Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/pdct 1759 make backend config endpoint focused on corpora not #430

Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/api/api_v1/routers/lookups/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from app.api.api_v1.routers.lookups.router import lookups_router
from app.clients.db.session import get_db
from app.models.metadata import ApplicationConfig
from app.models.config import ApplicationConfig
from app.repository.lookups import get_config
from app.service.custom_app import AppTokenFactory

Expand Down
27 changes: 27 additions & 0 deletions app/models/metadata.py → app/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,37 @@ class OrganisationConfig(BaseModel):
count_by_category: Mapping[str, int]


class CorpusConfig(BaseModel):
    """Contains the Corpus and Organisation info as well as stats used on homepage

    One instance describes a single corpus: its own details, the organisation
    that owns it, and the number of families it contains (overall and split by
    family category).
    """

    # From corpus
    corpus_import_id: str
    title: str
    description: str
    # The repository layer supplies a full CDN URL here, or "" when the corpus
    # has no image.
    image_url: str
    # The repository layer supplies "" here when the corpus has no text.
    text: str
    # From organisation
    organisation_name: str
    organisation_id: int
    # Number of families in the corpus
    total: int
    # Number of families in the corpus, keyed by family category value
    count_by_category: Mapping[str, int]


class CorpusTypeConfig(BaseModel):
    """Contains the CorpusType info as well as data of any corpora of that type

    Groups a corpus type's name, description and taxonomy together with the
    configuration of each corpus that belongs to that type.
    """

    corpus_type_name: str
    corpus_type_description: str
    # Populated by the repository layer from the corpus type's valid metadata
    taxonomy: TaxonomyData
    # One entry per corpus of this type
    corpora: Sequence[CorpusConfig]


class ApplicationConfig(BaseModel):
    """Definition of the application Config returned by the config lookup.

    Bundles the geographies, organisations, languages, document variants and
    the per-corpus-type configuration (taxonomy plus corpora) into one payload.
    """

    geographies: Sequence[dict]
    # NOTE(review): the tests mark the organisations section as "to be removed
    # as part of PDCT-1759" in favour of corpus_types - confirm before relying
    # on it in new code.
    organisations: Mapping[str, OrganisationConfig]
    languages: Mapping[str, str]
    # Names of the available document variants
    document_variants: Sequence[str]
    # Keyed by corpus type
    corpus_types: Mapping[str, CorpusTypeConfig]
108 changes: 108 additions & 0 deletions app/repository/corpus.py
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from typing import Any, Mapping

from db_client.models.dfce.family import Corpus, Family, FamilyCategory, FamilyCorpus
from db_client.models.organisation import CorpusType, Organisation
from sqlalchemy import func
from sqlalchemy.orm import Session

from app import config
from app.models.config import CorpusConfig, CorpusTypeConfig


def _get_family_stats_per_corpus(db: Session, corpus_import_id: str) -> dict[str, Any]:
    """Count the families in a corpus, overall and per family category.

    :param Session db: The database session used to run the query.
    :param str corpus_import_id: Import id of the corpus whose families are
        counted.
    :return dict[str, Any]: A dict with two keys: ``"total"`` (the number of
        families in the corpus) and ``"count_by_category"`` (a dict holding a
        count for every ``FamilyCategory`` value, zero when none were found).
    """
    counts = (
        db.query(Family.family_category, func.count())
        .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id)
        .filter(FamilyCorpus.corpus_import_id == corpus_import_id)
        .group_by(Family.family_category)
        .all()
    )
    found_categories = {category.value: count for category, count in counts}

    # Supply zeros when there aren't any
    count_by_category = {
        category.value: found_categories.get(category.value, 0)
        for category in FamilyCategory
    }

    # Grouping by the single category column puts each matching family in
    # exactly one group, so the total is the sum of the per-category counts.
    # This avoids a second COUNT query per corpus and guarantees that the total
    # always agrees with the breakdown.
    total = sum(count_by_category.values())

    return {"total": total, "count_by_category": count_by_category}


def _to_corpus_type_config(row, stats: dict[str, Any]) -> dict[str, CorpusTypeConfig]:
    """Convert one corpus query row and its stats into a corpus type config.

    :param row: A result row exposing the corpus, corpus type and organisation
        columns by label (e.g. ``corpus_import_id``, ``corpus_type``,
        ``taxonomy``, ``organisation_name``).
    :param dict[str, Any] stats: The family statistics for the corpus, holding
        the ``"total"`` and ``"count_by_category"`` keys.
    :return dict[str, CorpusTypeConfig]: A single-entry dict mapping the row's
        corpus type to a config that contains just this one corpus.
    """
    # Only build a CDN URL when the corpus actually has an image path.
    image_url = f"https://{config.CDN_DOMAIN}/{row.image_url}" if row.image_url else ""

    corpus = CorpusConfig(
        title=row.title,
        description=row.description,
        corpus_import_id=row.corpus_import_id,
        text=row.text or "",
        image_url=image_url,
        organisation_id=row.organisation_id,
        organisation_name=row.organisation_name,
        total=stats["total"],
        count_by_category=stats["count_by_category"],
    )
    corpus_type_config = CorpusTypeConfig(
        corpus_type_name=row.corpus_type_name,
        corpus_type_description=row.corpus_type_description,
        # Shallow copy so the model does not share the row's mapping.
        taxonomy={**row.taxonomy},
        corpora=[corpus],
    )
    return {row.corpus_type: corpus_type_config}


def _get_config_for_corpus(db: Session, row) -> dict[str, CorpusTypeConfig]:
    """Build the corpus type config for the corpus described by a query row.

    :param Session db: The database session used to gather family statistics.
    :param row: A corpus result row; its ``corpus_import_id`` selects the
        families to count.
    :return dict[str, CorpusTypeConfig]: A single-entry dict mapping the row's
        corpus type to its config.
    """
    return _to_corpus_type_config(
        row, _get_family_stats_per_corpus(db, row.corpus_import_id)
    )


def get_config_for_allowed_corpora(
    db: Session, allowed_corpora: list[str]
) -> Mapping[str, CorpusTypeConfig]:
    """Build the corpus type configuration for the allowed corpora.

    :param Session db: The database session used to run the queries.
    :param list[str] allowed_corpora: Import ids of the corpora to include.
        When empty, no filter is applied and every corpus is included.
    :return Mapping[str, CorpusTypeConfig]: A mapping of corpus type to its
        config, where each config lists every included corpus of that type.
    """
    query = (
        db.query(
            Corpus.import_id.label("corpus_import_id"),
            Corpus.title.label("title"),
            Corpus.description.label("description"),
            Corpus.corpus_image_url.label("image_url"),
            Corpus.corpus_text.label("text"),
            Corpus.corpus_type_name.label("corpus_type"),
            CorpusType.name.label("corpus_type_name"),
            CorpusType.description.label("corpus_type_description"),
            CorpusType.valid_metadata.label("taxonomy"),
            Organisation.id.label("organisation_id"),
            Organisation.name.label("organisation_name"),
        )
        .join(
            CorpusType,
            Corpus.corpus_type_name == CorpusType.name,
        )
        .join(Organisation, Corpus.organisation_id == Organisation.id)
    )
    if allowed_corpora:
        query = query.filter(Corpus.import_id.in_(allowed_corpora))

    config_for_allowed_corpora: dict[str, CorpusTypeConfig] = {}
    for row in query.all():
        for corpus_type, type_config in _get_config_for_corpus(db, row).items():
            existing = config_for_allowed_corpora.get(corpus_type)
            if existing is None:
                config_for_allowed_corpora[corpus_type] = type_config
                continue

            # Several corpora can share one corpus type. Combine their corpora
            # rather than letting the later row overwrite the earlier one,
            # which would silently drop corpora from the response.
            config_for_allowed_corpora[corpus_type] = CorpusTypeConfig(
                corpus_type_name=existing.corpus_type_name,
                corpus_type_description=existing.corpus_type_description,
                taxonomy=existing.taxonomy,
                corpora=[*existing.corpora, *type_config.corpora],
            )

    return config_for_allowed_corpora
4 changes: 3 additions & 1 deletion app/repository/lookups.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from sqlalchemy.exc import MultipleResultsFound
from sqlalchemy.orm import Session

from app.models.metadata import ApplicationConfig
from app.models.config import ApplicationConfig
from app.repository.corpus import get_config_for_allowed_corpora
from app.repository.organisation import get_organisation_config, get_organisations
from app.service.pipeline import IMPORT_ID_MATCHER
from app.service.util import tree_table_to_json
Expand All @@ -28,6 +29,7 @@ def get_config(db: Session, allowed_corpora: list[str]) -> ApplicationConfig:
variant.variant_name
for variant in db.query(Variant).order_by(Variant.variant_name).all()
],
corpus_types=get_config_for_allowed_corpora(db, allowed_corpora),
)


Expand Down
2 changes: 1 addition & 1 deletion app/repository/organisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sqlalchemy.orm import Session

from app import config
from app.models.metadata import CorpusData, OrganisationConfig
from app.models.config import CorpusData, OrganisationConfig


def _to_corpus_data(row) -> CorpusData:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "navigator_backend"
version = "1.19.21"
version = "1.20.0"
description = ""
authors = ["CPR-dev-team <[email protected]>"]
packages = [{ include = "app" }, { include = "tests" }]
Expand Down
119 changes: 101 additions & 18 deletions tests/non_search/routers/lookups/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,27 +76,27 @@ def _add_family(test_db, import_id: str, cat: FamilyCategory, corpus_import_id):
)


def test_config_endpoint_content(data_client, data_db, valid_token):
def test_config_endpoint_content(data_client, data_db, app_token_factory, valid_token):
"""Tests whether we get the expected content when the /config endpoint is called."""
# TODO: this test is fragile, we should look into validation according to the
# supporting data, rather than counts & fixed lists
url_under_test = "/api/v1/config"
app_token = app_token_factory(
"CCLW.corpus.i00000001.n0000,UNFCCC.corpus.i00000001.n0000"
)

response = data_client.get(url_under_test, headers={"app-token": valid_token})
response = data_client.get(url_under_test, headers={"app-token": app_token})

response_json = response.json()

assert response.status_code == OK
assert (
set(response_json.keys())
^ {
"geographies",
"organisations",
"document_variants",
"languages",
}
== set()
)
assert set(response_json.keys()) == {
"geographies",
"organisations",
"document_variants",
"languages",
"corpus_types",
}

assert "geographies" in response_json
assert len(response_json["geographies"]) == 8
Expand All @@ -111,9 +111,51 @@ def test_config_endpoint_content(data_client, data_db, valid_token):
assert len(response_json["document_variants"]) == 2
assert "Original Language" in response_json["document_variants"]

# Now test organisations
assert "organisations" in response_json
corpus_types = response_json["corpus_types"]
assert list(corpus_types.keys()) == ["Laws and Policies", "Intl. agreements"]

laws_and_policies = corpus_types["Laws and Policies"]
assert laws_and_policies["corpus_type_name"] == "Laws and Policies"
assert laws_and_policies["corpus_type_description"] == "Laws and policies"

taxonomy = laws_and_policies["taxonomy"]
assert set(taxonomy) ^ EXPECTED_CCLW_TAXONOMY == set()
# Check document roles.
assert "role" in taxonomy["_document"].keys()
assert len(taxonomy["_document"]["role"]["allowed_values"]) == 10
assert "MAIN" in taxonomy["_document"]["role"]["allowed_values"]
# Check document types.
assert "type" in taxonomy["_document"].keys()
assert len(taxonomy["_document"]["type"]["allowed_values"]) == 76
assert "Adaptation Communication" in taxonomy["_document"]["type"]["allowed_values"]
# Check event types.
assert len(taxonomy["_event"]["event_type"]["allowed_values"]) == 17
assert "Passed/Approved" in taxonomy["_event"]["event_type"]["allowed_values"]

assert len(laws_and_policies["corpora"]) == 1
cclw_corpus = laws_and_policies["corpora"][0]

assert cclw_corpus["total"] == 0
assert cclw_corpus["count_by_category"] == {
"Executive": 0,
"Legislative": 0,
"UNFCCC": 0,
"MCF": 0,
}

assert cclw_corpus["corpus_import_id"] == "CCLW.corpus.i00000001.n0000"
assert cclw_corpus["organisation_name"] == "CCLW"
assert cclw_corpus["organisation_id"] == 1
assert (
cclw_corpus["image_url"]
== "https://cdn.climatepolicyradar.org/corpora/CCLW.corpus.i00000001.n0000/logo.png"
)
assert "Grantham Research Institute" in cclw_corpus["text"]
assert cclw_corpus["description"] == "CCLW national policies"
assert cclw_corpus["title"] == "CCLW national policies"

# Below to be removed as part of PDCT-1759
# Now test organisations
assert "CCLW" in response_json["organisations"]
cclw_org = response_json["organisations"]["CCLW"]
assert len(cclw_org) == LEN_ORG_CONFIG
Expand Down Expand Up @@ -198,6 +240,20 @@ def test_config_endpoint_cclw_stats(data_client, data_db, valid_token):

response_json = response.json()

corpus_types = response_json["corpus_types"]
assert len(corpus_types) == 2

cclw_corpus_config = corpus_types["Laws and Policies"]["corpora"][0]
laws = cclw_corpus_config["count_by_category"]["Legislative"]
policies = cclw_corpus_config["count_by_category"]["Executive"]
unfccc = cclw_corpus_config["count_by_category"]["UNFCCC"]
assert laws == 2
assert policies == 3
assert unfccc == 1

assert cclw_corpus_config["total"] == laws + policies + unfccc

# Below to be removed as part of PDCT-1759
org_config = response_json["organisations"]["CCLW"]
assert len(org_config) == LEN_ORG_CONFIG
assert org_config["total"] == 6
Expand Down Expand Up @@ -259,6 +315,18 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only(

response_json = response.json()

assert len(response_json["corpus_types"]) == 1

corpus = response_json["corpus_types"][expected_corpus_type.name]["corpora"][0]
assert corpus["total"] == 1
assert corpus["count_by_category"] == {
"Executive": 0,
"Legislative": 1,
"MCF": 0,
"UNFCCC": 0,
}

# Below to be removed as part of PDCT-1759
org_config = response_json["organisations"]
expected_org_config = {
expected_organisation: {
Expand All @@ -285,7 +353,7 @@ def test_config_endpoint_returns_stats_for_allowed_corpora_only(
"MCF": 0,
"UNFCCC": 0,
},
}
},
}
assert org_config == expected_org_config

Expand Down Expand Up @@ -323,20 +391,35 @@ def test_config_endpoint_returns_stats_for_all_orgs_if_no_allowed_corpora_in_app
)

_add_family(data_db, "T.0.0.1", FamilyCategory.EXECUTIVE, cclw_corpus.import_id)
_add_family(data_db, "T.0.0.2", FamilyCategory.LEGISLATIVE, unfccc_corpus.import_id)
_add_family(data_db, "T.0.0.2", FamilyCategory.EXECUTIVE, unfccc_corpus.import_id)
data_db.flush()

response = data_client.get(url_under_test, headers={"app-token": app_token})

response_json = response.json()

assert len(response_json["corpus_types"]) == 2
corpus_types = response_json["corpus_types"]

for corpus_type in list(corpus_types.values()):
for corpus in corpus_type["corpora"]:
assert corpus["total"] == 1
assert corpus["count_by_category"] == {
"Executive": 1,
"Legislative": 0,
"MCF": 0,
"UNFCCC": 0,
}

# Below to be removed as part of PDCT-1759
org_config = response_json["organisations"]

assert list(org_config.keys()) == ["CCLW", "UNFCCC"]
assert org_config["CCLW"]["total"] == 1
assert org_config["UNFCCC"]["total"] == 1
assert org_config["UNFCCC"]["count_by_category"] == {
"Executive": 0,
"Legislative": 1,
"Executive": 1,
"Legislative": 0,
"MCF": 0,
"UNFCCC": 0,
}
Expand Down
Loading