Skip to content

Commit

Permalink
Feature/pdct 1661 world map should only show counts of families from …
Browse files Browse the repository at this point in the history
…allowed corpora (#423)

* Make documents router dependent on app token

* Driveby: Add CORS tests for MCF

* Update slug lookup query to respect allowed corpora

* Include actual CCLW corpus ID in test token

* Bump to 1.19.11

* Refactor _get_query_template

* Refactor doc and fam lookup tests

* Add integration tests for doc/fam lookup when corpora mismatch

* Add alternative corpora token

* Refactor download code

* Geo

* Merge in main

* Bump to 1.19.16

* Fix docstring param name

* Fix capitalisation

* Fix capitalisation for MCF category

* Fix world map query counts

* Update world map stats repo function

* Move get_world_map_stats to service layer

* Rename function and add world map service

* Import from world map service

* Update counts

* Fix counts

* Rename router world map

* PR comment fixes

* Make representative of entity
  • Loading branch information
katybaulch authored Nov 28, 2024
1 parent 9089898 commit 1e2917a
Show file tree
Hide file tree
Showing 13 changed files with 374 additions and 217 deletions.
34 changes: 0 additions & 34 deletions app/api/api_v1/routers/geographies.py

This file was deleted.

51 changes: 51 additions & 0 deletions app/api/api_v1/routers/world_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import logging
from typing import Annotated

from fastapi import APIRouter, Depends, Header, HTTPException, Request, status

from app.clients.db.session import get_db
from app.errors import RepositoryError, ValidationError
from app.models.geography import GeographyStatsDTO
from app.service.custom_app import AppTokenFactory
from app.service.world_map import get_world_map_stats

# Module-level logger; note that it is keyed on this file's path (__file__).
_LOGGER = logging.getLogger(__file__)

# Router on which the world map endpoint below is registered.
world_map_router = APIRouter()


@world_map_router.get("/geographies", response_model=list[GeographyStatsDTO])
async def world_map_stats(
    request: Request, app_token: Annotated[str, Header()], db=Depends(get_db)
):
    """Get a summary of family counts for all geographies for world map.

    :param Request request: The incoming request, passed on to the app
        token validation.
    :param str app_token: The app token read from the request headers.
    :param db: Database session supplied by the ``get_db`` dependency.
    :raises HTTPException: 404 if no stats are found, 503 if the service
        raises a RepositoryError, 400 if it raises a ValidationError.
    :return: The world map stats; the route serialises them as a list of
        GeographyStatsDTO.
    """
    _LOGGER.info(
        "Getting world map counts for all geographies",
        extra={
            "props": {"app_token": str(app_token)},
        },
    )

    # Decode the app token and validate it.
    # NOTE(review): presumably this is what populates
    # ``token.allowed_corpora_ids`` used below - confirm in AppTokenFactory.
    token = AppTokenFactory()
    token.decode_and_validate(db, request, app_token)

    # Only the service call sits inside the try, so that the handlers below
    # can only ever see errors coming from the lookup itself.
    try:
        stats = get_world_map_stats(db, token.allowed_corpora_ids)
    except RepositoryError as e:
        _LOGGER.error(e)
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=e.message
        ) from e
    except ValidationError as e:
        _LOGGER.error(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST, detail=e.message
        ) from e

    if not stats:
        _LOGGER.error("No stats for world map found")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No stats for world map found",
        )

    return stats
4 changes: 2 additions & 2 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
from app.api.api_v1.routers.admin import admin_document_router
from app.api.api_v1.routers.auth import auth_router
from app.api.api_v1.routers.documents import documents_router
from app.api.api_v1.routers.geographies import geographies_router
from app.api.api_v1.routers.lookups import lookups_router
from app.api.api_v1.routers.pipeline_trigger import pipeline_trigger_router
from app.api.api_v1.routers.search import search_router
from app.api.api_v1.routers.summaries import summary_router
from app.api.api_v1.routers.world_map import world_map_router
from app.clients.db.session import SessionLocal, engine
from app.service.auth import get_superuser_details
from app.service.health import is_database_online
Expand Down Expand Up @@ -158,7 +158,7 @@ async def root():
summary_router, prefix="/api/v1", tags=["Summaries"], include_in_schema=False
)
app.include_router(
geographies_router, prefix="/api/v1", tags=["Geographies"], include_in_schema=False
world_map_router, prefix="/api/v1", tags=["Geographies"], include_in_schema=False
)

# add pagination support to all routes that ask for it
Expand Down
2 changes: 1 addition & 1 deletion app/repository/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_whole_database_dump(
"""Get whole database dump and bind variables.
:param str ingest_cycle_start: The current ingest cycle date.
:param list[str] corpora_ids: The corpora from which we
:param list[str] allowed_corpora_ids: The corpora from which we
should allow the data to be dumped.
:return pd.DataFrame: A DataFrame containing the results of the SQL
query that gets the whole database dump in our desired format.
Expand Down
119 changes: 27 additions & 92 deletions app/repository/geography.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
"""Functions to support the geographies endpoint."""

import logging
import os
from typing import Optional, Sequence

from db_client.models.dfce.family import (
Family,
FamilyDocument,
FamilyGeography,
FamilyStatus,
)
from db_client.models.dfce.family import Family, FamilyDocument, FamilyGeography
from db_client.models.dfce.geography import Geography
from sqlalchemy import func
from sqlalchemy.exc import OperationalError
from sqlalchemy import bindparam, text
from sqlalchemy.orm import Query, Session
from sqlalchemy.types import ARRAY, String

from app.errors import RepositoryError
from app.errors import ValidationError
from app.models.geography import GeographyStatsDTO
from app.repository.helpers import get_query_template

_LOGGER = logging.getLogger(__file__)

Expand Down Expand Up @@ -63,74 +60,32 @@ def get_geo_subquery(
return geo_subquery.subquery("geo_subquery")


def _db_count_fams_in_category_and_geo(db: Session) -> Query:
"""
Query the database for the fam count per category per geo.
NOTE: SqlAlchemy will make a complete hash of query generation if
columns are used in the query() call. Therefore, entire objects are
returned.
def count_families_per_category_in_each_geo(
db: Session, allowed_corpora: list[str]
) -> list[GeographyStatsDTO]:
"""Query the database for the family count per category per geo.
:param Session db: DB Session to perform query on.
:return Query: A Query object containing the queries to perform.
:param list[str] allowed_corpora: The list of allowed corpora IDs to
filter on.
:return list[GeographyStatsDTO]: A list of counts of families by
category per geography.
"""
# Get the required Geography information and cross join each with all of the unique
# family_category values (so if some geographies have no documents for a particular
# family_category, we can set the count for that category to 0).
family_categories = db.query(Family.family_category).distinct().subquery()
geo_family_combinations = db.query(
Geography.id.label("geography_id"),
Geography.display_value,
Geography.slug,
Geography.value,
family_categories.c.family_category,
).subquery("geo_family_combinations")

# Get a count of documents in each present family_category for each geography.
counts = (
db.query(
Family.family_category,
FamilyGeography.geography_id,
func.count().label("records_count"),
)
.join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id)
.filter(Family.family_status == FamilyStatus.PUBLISHED)
.group_by(Family.family_category, FamilyGeography.geography_id)
.subquery("counts")
)
if allowed_corpora in [None, []]:
raise ValidationError("No allowed corpora provided")

# Aggregate family_category counts per geography into a JSONB object, and if a
# family_category count is missing, set the count for that category to 0 so each
# geography will always have a count for all family_category values.
query = (
db.query(
geo_family_combinations.c.display_value.label("display_value"),
geo_family_combinations.c.slug.label("slug"),
geo_family_combinations.c.value.label("value"),
func.jsonb_object_agg(
geo_family_combinations.c.family_category,
func.coalesce(counts.c.records_count, 0),
).label("counts"),
)
.select_from(
geo_family_combinations.join(
counts,
(geo_family_combinations.c.geography_id == counts.c.geography_id)
& (
geo_family_combinations.c.family_category
== counts.c.family_category
),
isouter=True,
)
)
.group_by(
geo_family_combinations.c.display_value,
geo_family_combinations.c.slug,
geo_family_combinations.c.value,
)
.order_by(geo_family_combinations.c.display_value)
query_template = text(
get_query_template(os.path.join("app", "repository", "sql", "world_map.sql"))
)
return query
query_template = query_template.bindparams(
bindparam("allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String)),
)

family_geo_stats = db.execute(
query_template, {"allowed_corpora_ids": allowed_corpora}
).all()
results = [_to_dto(fgs) for fgs in family_geo_stats]
return results


def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO:
Expand All @@ -148,23 +103,3 @@ def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO:
slug=family_doc_geo_stats.slug,
family_counts=family_doc_geo_stats.counts,
)


def get_world_map_stats(db: Session) -> list[GeographyStatsDTO]:
    """Get a count of families per category per geography for all geographies.

    :param Session db: The database session.
    :raises RepositoryError: If the database raises an OperationalError
        while the stats query is executed.
    :return list[GeographyStatsDTO]: A list of Geography stats objects;
        empty when the query returns no rows.
    """
    try:
        family_geo_stats = _db_count_fams_in_category_and_geo(db).all()
    except OperationalError as e:
        _LOGGER.error(e)
        # Chain the driver error so the root cause stays in the traceback.
        raise RepositoryError(
            "Error querying the database for geography stats"
        ) from e

    # An empty result set naturally yields an empty list here.
    return [_to_dto(fgs) for fgs in family_geo_stats]
3 changes: 2 additions & 1 deletion app/repository/sql/download.sql
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ SELECT
n3.event_type_names AS "Full timeline of events (types)",
n3.event_dates AS "Full timeline of events (dates)",
d.created::DATE AS "Date Added to System",
f.last_modified::DATE AS "Last ModIFied on System",
f.last_modified::DATE AS "Last Modified on System",
d.import_id AS "Internal Document ID",
f.import_id AS "Internal Family ID",
n1.collection_import_ids AS "Internal Collection ID(s)",
Expand All @@ -237,6 +237,7 @@ SELECT
type,0}') AS "Document Type",
CASE
WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC'
WHEN f.family_category = 'MCF' THEN 'MCF'
ELSE INITCAP(f.family_category::TEXT)
END AS "Category",
ARRAY_TO_STRING(
Expand Down
93 changes: 93 additions & 0 deletions app/repository/sql/world_map.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
-- World map statistics: for every geography, the number of published
-- families per family category, restricted to the corpora supplied through
-- the :allowed_corpora_ids bind parameter (an array of corpus import IDs).
--
-- Result: one row per geography (display_value, slug, value) plus `counts`,
-- a JSONB object mapping each family category to its count.
WITH counts AS (
    -- Number of matching rows per (family_category, geography_id).
    SELECT
        family.family_category,
        family_geography.geography_id,
        COUNT(*) AS records_count
    FROM
        family
    INNER JOIN
        family_corpus
        ON family.import_id = family_corpus.family_import_id
    INNER JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id
    INNER JOIN
        family_geography
        ON family.import_id = family_geography.family_import_id
    WHERE
        family_corpus.corpus_import_id = ANY(:allowed_corpora_ids)
        -- A family counts as published when at least one of its documents
        -- has the PUBLISHED status. This is the only outcome the previous
        -- Created/Published/Deleted CASE ladder was ever compared against,
        -- so a single EXISTS expresses the same filter.
        AND EXISTS (
            SELECT
                1
            FROM
                family_document
            WHERE
                family_document.family_import_id = family.import_id
                AND family_document.document_status = 'PUBLISHED'
        )
    GROUP BY
        family.family_category,
        family_geography.geography_id
)

SELECT
    geo_family_combinations.display_value,
    geo_family_combinations.slug,
    geo_family_combinations.value,
    -- Combinations with no matching row in `counts` are reported as 0, so
    -- every geography carries a key for every category.
    JSONB_OBJECT_AGG(
        geo_family_combinations.family_category,
        COALESCE(counts.records_count, 0)
    ) AS counts
FROM
    (
        -- Every geography paired with every distinct family category.
        -- The category list is taken from all families, not only those in
        -- the allowed corpora.
        SELECT
            geography.id AS geography_id,
            geography.display_value,
            geography.slug,
            geography.value,
            family_categories.family_category
        FROM
            geography
        CROSS JOIN
            (
                SELECT DISTINCT
                    family.family_category
                FROM
                    family
            ) AS family_categories
    ) AS geo_family_combinations
LEFT OUTER JOIN
    counts
    ON geo_family_combinations.geography_id = counts.geography_id
    AND geo_family_combinations.family_category = counts.family_category
GROUP BY
    geo_family_combinations.display_value,
    geo_family_combinations.slug,
    geo_family_combinations.value
ORDER BY
    geo_family_combinations.display_value;
Loading

0 comments on commit 1e2917a

Please sign in to comment.