Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/pdct 1661 world map should only show counts of families from allowed corpora #423

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
277e674
Make documents router dependent on app token
katybaulch Nov 4, 2024
0539d74
Driveby: Add CORS tests for MCF
katybaulch Nov 5, 2024
4a164d8
Update slug lookup query to respect allowed corpora
katybaulch Nov 5, 2024
b43aabf
Include actual CCLW corpus ID in test token
katybaulch Nov 5, 2024
6abc4ae
Merge branch 'main' into feature/pdct-1533-documents-should-only-show…
katybaulch Nov 5, 2024
e2e2d5c
Bump to 1.19.11
katybaulch Nov 5, 2024
45fa716
Refactor _get_query_template
katybaulch Nov 5, 2024
6010fce
Refactor doc and fam lookup tests
katybaulch Nov 5, 2024
7fee18f
Add integration tests for doc/fam lookup when corpora mismatch
katybaulch Nov 5, 2024
2a830db
Add alternative corpora token
katybaulch Nov 5, 2024
024d60b
Refactor download code
katybaulch Nov 5, 2024
9bcaff2
Merge branch 'main' into feature/pdct-1661-geo-should-only-show-docum…
katybaulch Nov 6, 2024
c4a50b5
Geo
katybaulch Nov 25, 2024
a050362
Merge branch 'main' into feature/pdct-1661-geo-should-only-show-docum…
katybaulch Nov 27, 2024
8480098
Merge in main
katybaulch Nov 28, 2024
a941cc1
Bump to 1.19.16
katybaulch Nov 28, 2024
7dbf911
Fix docstring param name
katybaulch Nov 28, 2024
b53e8a4
Fix capitalisation
katybaulch Nov 28, 2024
a9bc84e
Fix capitalisation for MCF category
katybaulch Nov 28, 2024
6ae9ee9
Fix world map query counts
katybaulch Nov 28, 2024
10b9dec
Update world map stats repo function
katybaulch Nov 28, 2024
b2030b4
Move get_world_map_stats to service layer
katybaulch Nov 28, 2024
e473a43
Rename function and add world map service
katybaulch Nov 28, 2024
83dc43e
Import from world map service
katybaulch Nov 28, 2024
ab8c701
Update counts
katybaulch Nov 28, 2024
9078770
Fix counts
katybaulch Nov 28, 2024
b2a3671
Rename router world map
katybaulch Nov 28, 2024
ce653c4
Merge branch 'main' into feature/pdct-1661-geo-should-only-show-docum…
katybaulch Nov 28, 2024
1df2e9e
PR comment fixes
katybaulch Nov 28, 2024
eeeb7e1
Make representative of entity
katybaulch Nov 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 0 additions & 34 deletions app/api/api_v1/routers/geographies.py

This file was deleted.

51 changes: 51 additions & 0 deletions app/api/api_v1/routers/world_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import logging
from typing import Annotated

from fastapi import APIRouter, Depends, Header, HTTPException, Request, status

from app.clients.db.session import get_db
from app.errors import RepositoryError, ValidationError
from app.models.geography import GeographyStatsDTO
from app.service.custom_app import AppTokenFactory
from app.service.world_map import get_world_map_stats

_LOGGER = logging.getLogger(__file__)

world_map_router = APIRouter()


@world_map_router.get("/geographies", response_model=list[GeographyStatsDTO])
async def world_map_stats(
request: Request, app_token: Annotated[str, Header()], db=Depends(get_db)
):
"""Get a summary of family counts for all geographies for world map."""
_LOGGER.info(
"Getting world map counts for all geographies",
extra={
"props": {"app_token": str(app_token)},
},
)

# Decode the app token and validate it.
token = AppTokenFactory()
token.decode_and_validate(db, request, app_token)

try:
world_map_stats = get_world_map_stats(db, token.allowed_corpora_ids)

if world_map_stats == []:
_LOGGER.error("No stats for world map found")
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="No stats for world map found",
)

return world_map_stats
except RepositoryError as e:
_LOGGER.error(e)
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=e.message
)
except ValidationError as e:
_LOGGER.error(e)
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=e.message)
4 changes: 2 additions & 2 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
from app.api.api_v1.routers.admin import admin_document_router
from app.api.api_v1.routers.auth import auth_router
from app.api.api_v1.routers.documents import documents_router
from app.api.api_v1.routers.geographies import geographies_router
from app.api.api_v1.routers.lookups import lookups_router
from app.api.api_v1.routers.pipeline_trigger import pipeline_trigger_router
from app.api.api_v1.routers.search import search_router
from app.api.api_v1.routers.summaries import summary_router
from app.api.api_v1.routers.world_map import world_map_router
from app.clients.db.session import SessionLocal, engine
from app.service.auth import get_superuser_details
from app.service.health import is_database_online
Expand Down Expand Up @@ -158,7 +158,7 @@ async def root():
summary_router, prefix="/api/v1", tags=["Summaries"], include_in_schema=False
)
app.include_router(
geographies_router, prefix="/api/v1", tags=["Geographies"], include_in_schema=False
world_map_router, prefix="/api/v1", tags=["Geographies"], include_in_schema=False
)

# add pagination support to all routes that ask for it
Expand Down
2 changes: 1 addition & 1 deletion app/repository/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_whole_database_dump(
"""Get whole database dump and bind variables.

:param str ingest_cycle_start: The current ingest cycle date.
:param list[str] corpora_ids: The corpora from which we
:param list[str] allowed_corpora_ids: The corpora from which we
should allow the data to be dumped.
:return pd.DataFrame: A DataFrame containing the results of the SQL
query that gets the whole database dump in our desired format.
Expand Down
119 changes: 27 additions & 92 deletions app/repository/geography.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
"""Functions to support the geographies endpoint."""

import logging
import os
from typing import Optional, Sequence

from db_client.models.dfce.family import (
Family,
FamilyDocument,
FamilyGeography,
FamilyStatus,
)
from db_client.models.dfce.family import Family, FamilyDocument, FamilyGeography
from db_client.models.dfce.geography import Geography
from sqlalchemy import func
from sqlalchemy.exc import OperationalError
from sqlalchemy import bindparam, text
from sqlalchemy.orm import Query, Session
from sqlalchemy.types import ARRAY, String

from app.errors import RepositoryError
from app.errors import ValidationError
from app.models.geography import GeographyStatsDTO
from app.repository.helpers import get_query_template

_LOGGER = logging.getLogger(__file__)

Expand Down Expand Up @@ -63,74 +60,32 @@ def get_geo_subquery(
return geo_subquery.subquery("geo_subquery")


def _db_count_fams_in_category_and_geo(db: Session) -> Query:
"""
Query the database for the fam count per category per geo.

NOTE: SqlAlchemy will make a complete hash of query generation if
columns are used in the query() call. Therefore, entire objects are
returned.
def count_families_per_category_in_each_geo(
db: Session, allowed_corpora: list[str]
) -> list[GeographyStatsDTO]:
"""Query the database for the family count per category per geo.

:param Session db: DB Session to perform query on.
:return Query: A Query object containing the queries to perform.
:param list[str] allowed_corpora: The list of allowed corpora IDs to
filter on.
:return list[GeographyStatsDTO]: A list of counts of families by
category per geography.
"""
# Get the required Geography information and cross join each with all of the unique
# family_category values (so if some geographies have no documents for a particular
# family_category, we can set the count for that category to 0).
family_categories = db.query(Family.family_category).distinct().subquery()
geo_family_combinations = db.query(
Geography.id.label("geography_id"),
Geography.display_value,
Geography.slug,
Geography.value,
family_categories.c.family_category,
).subquery("geo_family_combinations")

# Get a count of documents in each present family_category for each geography.
counts = (
db.query(
Family.family_category,
FamilyGeography.geography_id,
func.count().label("records_count"),
)
.join(FamilyGeography, Family.import_id == FamilyGeography.family_import_id)
.filter(Family.family_status == FamilyStatus.PUBLISHED)
.group_by(Family.family_category, FamilyGeography.geography_id)
.subquery("counts")
)
if allowed_corpora in [None, []]:
raise ValidationError("No allowed corpora provided")

# Aggregate family_category counts per geography into a JSONB object, and if a
# family_category count is missing, set the count for that category to 0 so each
# geography will always have a count for all family_category values.
query = (
db.query(
geo_family_combinations.c.display_value.label("display_value"),
geo_family_combinations.c.slug.label("slug"),
geo_family_combinations.c.value.label("value"),
func.jsonb_object_agg(
geo_family_combinations.c.family_category,
func.coalesce(counts.c.records_count, 0),
).label("counts"),
)
.select_from(
geo_family_combinations.join(
counts,
(geo_family_combinations.c.geography_id == counts.c.geography_id)
& (
geo_family_combinations.c.family_category
== counts.c.family_category
),
isouter=True,
)
)
.group_by(
geo_family_combinations.c.display_value,
geo_family_combinations.c.slug,
geo_family_combinations.c.value,
)
.order_by(geo_family_combinations.c.display_value)
query_template = text(
get_query_template(os.path.join("app", "repository", "sql", "world_map.sql"))
)
return query
query_template = query_template.bindparams(
bindparam("allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String)),
)

family_geo_stats = db.execute(
query_template, {"allowed_corpora_ids": allowed_corpora}
).all()
results = [_to_dto(fgs) for fgs in family_geo_stats]
return results


def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO:
Expand All @@ -148,23 +103,3 @@ def _to_dto(family_doc_geo_stats) -> GeographyStatsDTO:
slug=family_doc_geo_stats.slug,
family_counts=family_doc_geo_stats.counts,
)


def get_world_map_stats(db: Session) -> list[GeographyStatsDTO]:
"""
Get a count of fam per category per geography for all geographies.

:param db Session: The database session.
:return list[GeographyStatsDTO]: A list of Geography stats objects
"""
try:
family_geo_stats = _db_count_fams_in_category_and_geo(db).all()
except OperationalError as e:
_LOGGER.error(e)
raise RepositoryError("Error querying the database for geography stats")

if not family_geo_stats:
return []

result = [_to_dto(fgs) for fgs in family_geo_stats]
return result
3 changes: 2 additions & 1 deletion app/repository/sql/download.sql
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ SELECT
n3.event_type_names AS "Full timeline of events (types)",
n3.event_dates AS "Full timeline of events (dates)",
d.created::DATE AS "Date Added to System",
f.last_modified::DATE AS "Last ModIFied on System",
f.last_modified::DATE AS "Last Modified on System",
d.import_id AS "Internal Document ID",
f.import_id AS "Internal Family ID",
n1.collection_import_ids AS "Internal Collection ID(s)",
Expand All @@ -237,6 +237,7 @@ SELECT
type,0}') AS "Document Type",
CASE
WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC'
WHEN f.family_category = 'MCF' THEN 'MCF'
ELSE INITCAP(f.family_category::TEXT)
END AS "Category",
ARRAY_TO_STRING(
Expand Down
93 changes: 93 additions & 0 deletions app/repository/sql/world_map.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
WITH counts AS (
SELECT
family.family_category,
family_geography.geography_id,
COUNT(*) AS records_count
FROM
family
INNER JOIN
family_corpus
ON family.import_id = family_corpus.family_import_id
INNER JOIN corpus ON family_corpus.corpus_import_id = corpus.import_id
INNER JOIN
family_geography
ON family.import_id = family_geography.family_import_id
WHERE
family_corpus.corpus_import_id = ANY(:allowed_corpora_ids)
AND CASE
WHEN (
NOT (
EXISTS (
SELECT
1
FROM
family_document
WHERE
family.import_id = family_document.family_import_id
)
)
) THEN 'Created'
WHEN (
(
SELECT
COUNT(family_document.document_status) AS count_1
FROM
family_document
WHERE
family_document.family_import_id = family.import_id
AND family_document.document_status = 'PUBLISHED'
) > 0
) THEN 'Published'
WHEN (
(
SELECT
COUNT(family_document.document_status) AS count_2
FROM
family_document
WHERE
family_document.family_import_id = family.import_id
AND family_document.document_status = 'CREATED'
) > 0
) THEN 'Created'
ELSE 'Deleted'
END = 'Published'
GROUP BY
family.family_category,
family_geography.geography_id
)

SELECT
geo_family_combinations.display_value,
geo_family_combinations.slug,
geo_family_combinations.value,
JSONB_OBJECT_AGG(
geo_family_combinations.family_category,
COALESCE(counts.records_count, 0)
) AS counts
FROM
(
SELECT
geography.id AS geography_id,
geography.display_value,
geography.slug,
geography.value,
anon_1.family_category
FROM
geography,
(
SELECT DISTINCT
family.family_category
FROM
family
) AS anon_1
) AS geo_family_combinations
LEFT OUTER JOIN
counts
ON geo_family_combinations.geography_id = counts.geography_id
AND geo_family_combinations.family_category = counts.family_category
GROUP BY
geo_family_combinations.display_value,
geo_family_combinations.slug,
geo_family_combinations.value
ORDER BY
geo_family_combinations.display_value;
Loading
Loading