From b1ec83b158d8d15e67ac19bc4cbdeb46934588aa Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Thu, 28 Nov 2024 11:18:38 +0000 Subject: [PATCH 1/2] fix: exact match should not perform stemming (#422) * bump sdk version to 1.9.5 * update vespa schemas in line with prod * bump version in pyproject.toml to 1.19.14 --- poetry.lock | 10 +- pyproject.toml | 4 +- .../schemas/document_passage.sd | 66 ++++++++++++- .../schemas/family_document.sd | 92 +++++++++++++++++++ 4 files changed, 162 insertions(+), 10 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2e443fb5..6d497b9e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -702,13 +702,13 @@ files = [ [[package]] name = "cpr-sdk" -version = "1.9.3" +version = "1.9.5" description = "" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "cpr_sdk-1.9.3-py3-none-any.whl", hash = "sha256:1cd725de96d3af7a1f74c5d5eab18b207702944549f945e5392a039242c948b8"}, - {file = "cpr_sdk-1.9.3.tar.gz", hash = "sha256:f7ee60d81b2c9520cae237742582052472b5581601a8bb33862194bae8c4e0e1"}, + {file = "cpr_sdk-1.9.5-py3-none-any.whl", hash = "sha256:dd32806499b5bb44c98be1f4135b88406f1a77abcf60c7d4f61ac740de979da3"}, + {file = "cpr_sdk-1.9.5.tar.gz", hash = "sha256:addc22e557381935ac66c95721312c4a37080fda7419381971cdf6e7cb331fe0"}, ] [package.dependencies] @@ -4250,4 +4250,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "40e1911a3a0b0211027e7326d521d1fc9cdf8aa6c798279f1d5134a4bc2a5f57" +content-hash = "d96225e60602c52c47631f4a12f8519bcb5d1e2f5d2b15c2f57760db6b28c33c" diff --git a/pyproject.toml b/pyproject.toml index 11d675fa..e5569fe6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.13" +version = "1.19.14" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] @@ -10,7 +10,7 @@ python = "^3.10" Authlib = "^0.15.5" bcrypt = "^3.2.0" boto3 = "^1.26" -cpr_sdk = { version = "1.9.3", extras = ["vespa"] } +cpr_sdk = { version = "1.9.5", extras = ["vespa"] } fastapi = "^0.104.1" fastapi-health = "^0.4.0" fastapi-pagination = { extras = ["sqlalchemy"], version = "^0.12.19" } diff --git a/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd b/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd index 29492a3a..863929db 100644 --- a/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd +++ b/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd @@ -1,5 +1,10 @@ schema document_passage { + field text_block_not_stemmed type string { + indexing: input text_block | summary | index + stemming: none + } + document document_passage { field search_weights_ref type reference { @@ -134,6 +139,37 @@ schema document_passage { summary concepts {} } + document-summary search_summary_with_tokens { + summary family_name {} + summary family_description {} + summary family_import_id {} + summary family_slug {} + summary family_category {} + summary family_publication_ts {} + summary family_geography {} + summary family_geographies {} + summary family_source {} + summary document_import_id {} + summary document_slug {} + summary document_languages {} + summary document_content_type {} + summary document_cdn_object {} + summary document_source_url {} + summary corpus_import_id {} + summary corpus_type_name {} + summary metadata {} + summary text_block {} + summary text_block_id {} + summary text_block_type {} + summary text_block_page {} + summary text_block_coords {} + summary concepts {} + summary text_block_tokens { + source: text_block + tokens + } + } + rank-profile exact inherits default { function text_score() { expression: attribute(passage_weight) * fieldMatch(text_block) @@ -141,7 +177,17 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() fieldMatch(text_block) + } + + rank-profile exact_not_stemmed inherits default { + function text_score() { + expression: attribute(passage_weight) * fieldMatch(text_block_not_stemmed) + } + first-phase { + expression: text_score() + } + match-features: text_score() fieldMatch(text_block) } rank-profile hybrid_no_closeness inherits default { @@ -151,7 +197,7 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() bm25(text_block) } rank-profile hybrid inherits default { @@ -164,6 +210,20 @@ schema document_passage { first-phase { expression: text_score() } - match-features: text_score() + match-features: text_score() bm25(text_block) closeness(text_embedding) + } + + rank-profile hybrid_custom_weight inherits default { + inputs { + query(query_embedding) tensor(x[768]) + query(bm25_weight) double + } + function text_score() { + expression: attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding)) + } + first-phase { + expression: text_score() + } + match-features: text_score() bm25(text_block) closeness(text_embedding) } } diff --git a/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd b/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd index e62d6df5..e56963b2 100644 --- a/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd +++ b/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd @@ -1,5 +1,15 @@ schema family_document { + field family_name_not_stemmed type string { + indexing: input family_name_index | index + stemming: none + } + + field family_description_not_stemmed type string { + indexing: input family_description_index | index + stemming: none + } + document family_document { field search_weights_ref type reference { @@ -170,6 +180,19 @@ schema family_document { } match-features: name_score() description_score() } + + rank-profile exact_not_stemmed inherits default { + function name_score() { + expression: attribute(name_weight) * fieldMatch(family_name_not_stemmed) + } + function description_score() { + expression: attribute(description_weight) * fieldMatch(family_description_not_stemmed) + } + first-phase { + expression: name_score() + description_score() + } + match-features: name_score() description_score() + } rank-profile hybrid_no_closeness inherits default { function name_score() { @@ -199,6 +222,40 @@ schema family_document { } match-features: name_score() description_score() } + + rank-profile hybrid_no_description_embedding inherits default { + inputs { + query(query_embedding) tensor(x[768]) + } + function name_score() { + expression: attribute(name_weight) * bm25(family_name_index) + } + function description_score() { + expression: attribute(description_weight) * bm25(family_description_index) + } + first-phase { + expression: name_score() + description_score() + } + match-features: name_score() description_score() + } + + rank-profile hybrid_custom_weight inherits default { + inputs { + query(query_embedding) tensor(x[768]) + query(bm25_weight) double + } + function name_score() { + expression: attribute(name_weight) * bm25(family_name_index) + } + function description_score() { + expression: attribute(description_weight) * bm25(family_description_index) + } + first-phase { + expression: name_score() + description_score() + } + match-features: name_score() description_score() + } + document-summary search_summary { summary family_name {} @@ -223,4 +280,39 @@ schema family_document { summary collection_title {} summary collection_summary {} } + + document-summary search_summary_with_tokens { + summary family_name {} + summary family_description {} + summary family_import_id {} + summary family_slug {} + summary family_category {} + summary family_publication_ts {} + summary family_geography {} + summary family_geographies {} + summary family_source {} + summary document_import_id {} + summary document_title {} + summary document_slug {} + summary document_languages {} + summary document_content_type {} + summary document_cdn_object {} + summary document_source_url {} + summary metadata {} + summary corpus_import_id {} + summary corpus_type_name {} + summary collection_title {} + summary collection_summary {} + summary family_name_index {} + summary family_name_index_tokens { + source: family_name_index + tokens + } + summary family_description_index {} + summary family_description_index_tokens { + source: family_description_index + tokens + } + from-disk + } } From 9089898ff4cf0e833910323d5db295bf482b228b Mon Sep 17 00:00:00 2001 From: Katy Baulch <46493669+katybaulch@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:04:37 +0000 Subject: [PATCH 2/2] Add sql linting using SQLFluff (#417) * Enable sql-formatter * Fix formatting * Replace with variables & bind * Update .git-blame-ignore-revs * Fix formatting * Remove sqlformatter & enable sqlfluff * Fix formatting * Fix formatting * Use sqlfluff lint and fix * Bump pyproject * Fix Download query placeholders * Add sql linting * Fixed slug_lookup query * Fix query param binding * Update .sqlfluff * Rename keyword identifier to non keyword identifier * Revert makefile changes * Fix linting errors * Update download query logic based on linting * Update formatting * Remove debug function * Revert SQLalchemy 2 --- .git-blame-ignore-revs | 4 + .trunk/configs/.sqlfluff | 30 ++ .trunk/trunk.yaml | 50 +++ app/repository/document.py | 45 +-- app/repository/download.py | 46 ++- app/repository/helpers.py | 6 +- app/repository/sql/download.sql | 596 +++++++++++++++++------------ app/repository/sql/pipeline.sql | 503 ++++++++++++------------ app/repository/sql/slug_lookup.sql | 43 ++- makefile-docker.defs | 2 +- pyproject.toml | 2 +- 11 files changed, 774 insertions(+), 553 deletions(-) create mode 100644 .trunk/configs/.sqlfluff diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 9a66c0e8..8c4c0200 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -23,3 +23,7 @@ # Updating the test data file for document passages to be indent=2 44624dcd1fa0835708bd9187a39bb0da8a31cd03 + +# Fix SQL query formatting +047766a85f086fc0986a6f2b49fee9d73fa219e8 +ab3476708920c5760f058ec40d14d008f94f5bad diff --git a/.trunk/configs/.sqlfluff b/.trunk/configs/.sqlfluff new file mode 100644 index 00000000..de7aacd3 --- /dev/null +++ b/.trunk/configs/.sqlfluff @@ -0,0 +1,30 @@ +[sqlfluff] +dialect = postgres +exclude_rules = LT02, LT09 + +[sqlfluff:indentation] +indented_ctes = True + +[sqlfluff:layout:type:colon] +spacing_before = single +spacing_after = single + +[sqlfluff:layout:type:parameter] +spacing_before = touch +spacing_after = any + +[sqlfluff:rules:references.special_chars] +allow_space_in_identifier = True +additional_allowed_characters = ["/", "_", "-", "(", ")"] + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.identifiers] +extended_capitalisation_policy = lower + +[sqlfluff:rules:capitalisation.functions] +extended_capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.types] +extended_capitalisation_policy = upper diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 6d746ff9..31ff5439 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -5,6 +5,14 @@ version: 0.1 cli: version: 1.22.0 +tools: + definitions: + - name: sqlfluff + runtime: python + package: sqlfluff + shims: [sqlfluff] + known_good_version: 1.4.5 + # Trunk provides extensibility via plugins. # (https://docs.trunk.io/plugins) plugins: @@ -27,6 +35,7 @@ lint: disabled: - hadolint - oxipng + definitions: - name: bandit direct_configs: [bandit.yaml] @@ -34,6 +43,45 @@ lint: - name: lint run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target} + - name: sqlfluff + files: [sql, sql-j2, dml, ddl] + tools: [sqlfluff] + description: A dialect-flexible and configurable SQL linter + known_good_version: 1.4.5 + direct_configs: + - .sqlfluff + affects_cache: + - pyproject.toml + suggest_if: config_present + commands: + - name: lint + run: sqlfluff lint ${target} --format json --nofail + output: sarif + success_codes: [0] + read_output_from: stdout + parser: + runtime: python + run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py + + - name: fix + version: ">=3.0.0" + run: sqlfluff fix ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + + - name: format + run: sqlfluff format ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + ignore: - linters: [ALL] paths: @@ -45,6 +93,8 @@ lint: - LICENSE.md enabled: + - sqlfluff@3.2.5: + commands: [lint, fix, format] - actionlint@1.6.27 - bandit@1.7.8 - black@24.4.2 diff --git a/app/repository/document.py b/app/repository/document.py index 57579a26..73d19d47 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Database helper functions for the documents entity.""" import logging import os @@ -22,8 +18,9 @@ from db_client.models.dfce.metadata import FamilyMetadata from db_client.models.document.physical_document import PhysicalDocument from db_client.models.organisation.organisation import Organisation -from sqlalchemy import func +from sqlalchemy import bindparam, func, text from sqlalchemy.orm import Session +from sqlalchemy.types import ARRAY, String from app.models.document import ( CollectionOverviewResponse, @@ -42,22 +39,6 @@ _LOGGER = logging.getLogger(__file__) -def get_slugged_object_from_allowed_corpora_query( - template_query, slug_name: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. - - :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we - should allow the data to be dumped. - :return str: The SQL query to perform on the database session. - """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace("{slug_name}", slug_name).replace( # type: ignore - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - def get_slugged_objects( db: Session, slug: str, allowed_corpora: Optional[list[str]] = None ) -> tuple[Optional[str], Optional[str]]: @@ -74,14 +55,22 @@ def get_slugged_objects( :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or the Family import_id. """ - if allowed_corpora is not None: - query_template = get_query_template( - os.path.join("app", "repository", "sql", "slug_lookup.sql") + if allowed_corpora not in [None, []]: + query_template = text( + get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) + ) + + query_template = query_template.bindparams( + bindparam("slug_name", type_=String), + bindparam( + "allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String) + ), ) - query = get_slugged_object_from_allowed_corpora_query( - query_template, slug, allowed_corpora + query = db.execute( + query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora} ) - query = db.execute(query) else: query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( Slug.name == slug diff --git a/app/repository/download.py b/app/repository/download.py index 1ed90396..33592cc0 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -5,6 +5,8 @@ import pandas as pd from fastapi import Depends +from sqlalchemy import bindparam, text +from sqlalchemy.types import ARRAY, DATETIME, String from app.clients.db.session import get_db from app.repository.helpers import get_query_template @@ -12,32 +14,34 @@ _LOGGER = getLogger(__name__) -def create_query( - template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. +def get_whole_database_dump( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we + :param list[str] corpora_ids: The corpora from which we should allow the data to be dumped. - :return str: The SQL query to perform on the database session. + :return pd.DataFrame: A DataFrame containing the results of the SQL + query that gets the whole database dump in our desired format. """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace( # type: ignore - "{ingest_cycle_start}", ingest_cycle_start - ).replace( - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - -def get_whole_database_dump( - ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) -): - query_template = get_query_template( - os.path.join("app", "repository", "sql", "download.sql") + query = text( + get_query_template(os.path.join("app", "repository", "sql", "download.sql")) + ).bindparams( + bindparam("ingest_cycle_start", type_=DATETIME), + bindparam( + "allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String) + ), ) - query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) with db.connection() as conn: - df = pd.read_sql(query, conn.connection) + result = conn.execute( + query, + { + "ingest_cycle_start": ingest_cycle_start, + "allowed_corpora_ids": allowed_corpora_ids, + }, + ) + columns = result.keys() + df = pd.DataFrame(result.fetchall(), columns=columns) return df diff --git a/app/repository/helpers.py b/app/repository/helpers.py index e976683b..958e0b38 100644 --- a/app/repository/helpers.py +++ b/app/repository/helpers.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Helper functions for the repository layer.""" from functools import lru_cache diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 8a807080..15bbf8ac 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -1,243 +1,355 @@ -WITH -deduplicated_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( - SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc -), -unique_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( - SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_family_slugs as ( - SELECT - deduplicated_family_slugs.family_import_id as "family_import_id", - deduplicated_family_slugs.created as "created", - deduplicated_family_slugs.name as "name" - FROM deduplicated_family_slugs - UNION ALL - SELECT - unique_family_slugs.family_import_id as "family_import_id", - unique_family_slugs.created as "created", - unique_family_slugs.name as "name" - FROM unique_family_slugs - order by family_import_id desc, created desc - ), deduplicated_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( - SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by slug.family_document_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc -), -unique_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( - SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by slug.family_document_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_doc_slugs as ( - SELECT - deduplicated_doc_slugs.family_document_import_id - as "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM deduplicated_doc_slugs - UNION ALL - SELECT - unique_doc_slugs.family_document_import_id as "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM unique_doc_slugs - order by family_document_import_id desc, created desc - ), event_dates as ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date, - max(family_event.date::date) last_changed - FROM - family_event - GROUP BY - family_import_id -) +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC + ), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC + ), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN COUNT(*) FILTER ( + WHERE + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date, + MAX(family_event.date::DATE) AS last_changed + FROM + family_event + GROUP BY + family_event.family_import_id + ), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ), + +n1 AS ( + SELECT + collection_family.family_import_id, + STRING_AGG(collection.import_id, ';') AS collection_import_ids, + STRING_AGG(collection.title, ';') AS collection_titles, + STRING_AGG(collection.description, ';') AS collection_descriptions + FROM + collection + INNER JOIN + collection_family + ON collection.import_id = collection_family.collection_import_id + GROUP BY + collection_family.family_import_id + ) + SELECT -ds.name as "Document ID", -p.title as "Document Title", -fs.name as "Family ID", -f.title as "Family Title", -f.description as "Family Summary", -n1.collection_titles as "Collection Title(s)", -n1.collection_descriptions as "Collection Description(s)", -INITCAP(d.valid_metadata::json#>>'{ - role,0}') as -"Document Role", -d.variant_name as "Document Variant", -p.source_url as "Document Content URL", -INITCAP(d.valid_metadata::json#>>'{ - type,0}') as -"Document Type", -CASE - WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' - ELSE INITCAP(f.family_category::TEXT) -END "Category", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'framework')), ';') -as "Framework", -n2.language as "Language", -o.name as "Source", -fg.geo_isos as "Geography ISOs", -fg.geo_display_values as "Geographies", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'topic')), ';') -as "Topic/Response", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'hazard')), ';') -as "Hazard", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'sector')), ';') -as "Sector", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'keyword')), ';') -as "Keyword", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'instrument')), ';') -as "Instrument", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author')), ';') -as "Author", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author_type')), ';') -as "Author Type", -fp.published_date as "First event in timeline", -fp.last_changed as "Last event in timeline", -n3.event_type_names as "Full timeline of events (types)", -n3.event_dates as "Full timeline of events (dates)", -d.created::date as "Date Added to System", -f.last_modified::date as "Last ModIFied on System", -d.import_id as "Internal Document ID", -f.import_id as "Internal Family ID", -n1.collection_import_ids as "Internal Collection ID(s)" -FROM physical_document p -JOIN family_document d -ON p.id = d.physical_document_id -JOIN family f -ON d.family_import_id = f.import_id -FULL JOIN ( - SELECT - family_geography.family_import_id as "family_import_id", - string_agg(geography.value, ';') AS geo_isos, - string_agg(geography.display_value, ';') AS geo_display_values - FROM - geography - INNER JOIN family_geography - ON geography.id = family_geography.geography_id - GROUP BY family_geography.family_import_id -) fg ON fg.family_import_id=f.import_id -join family_corpus fc -on f.import_id = fc.family_import_id -join corpus c -on fc.corpus_import_id = c.import_id -join organisation o -on c.organisation_id = o.id -join family_metadata fm -on fm.family_import_id = f.import_id -FULL JOIN ( - SELECT - collection_family.family_import_id as "family_import_id", - string_agg(collection.import_id, ';') AS collection_import_ids, - string_agg(collection.title, ';') AS collection_titles, - string_agg(collection.description, ';') AS collection_descriptions - FROM - collection - INNER JOIN collection_family - ON collection_family.collection_import_id = collection.import_id - GROUP BY collection_family.family_import_id -) n1 ON n1.family_import_id=f.import_id -left JOIN ( - SELECT - p.id as "id", - string_agg(l.name, ';' ORDER BY l.name) AS language - FROM physical_document p - left join physical_document_language pdl - on pdl.document_id = p.id - left join language l - on l.id = pdl.language_id - GROUP BY p.id -) n2 ON n2.id=d.physical_document_id -FULL JOIN ( - SELECT - family_event.family_import_id, - string_agg(family_event.import_id, ';') AS event_import_ids, - string_agg(family_event.title, ';') AS event_titles, - string_agg(family_event.event_type_name, ';') AS event_type_names, - string_agg(family_event.date::date::text, ';') AS event_dates - FROM family_event - INNER JOIN family ON family.import_id = family_event.family_import_id - GROUP BY family_event.family_import_id -) n3 ON n3.family_import_id=f.import_id -LEFT JOIN most_recent_doc_slugs ds -on ds.family_document_import_id = d.import_id -LEFT JOIN most_recent_family_slugs fs on fs.family_import_id = f.import_id -LEFT JOIN event_dates fp on fp.family_import_id = f.import_id -WHERE d.last_modified < '{ingest_cycle_start}' AND fc.corpus_import_id in ({allowed_corpora_ids}) -ORDER BY d.last_modified desc, d.created desc, d.ctid desc, n1.family_import_id + ds.name AS "Document ID", + p.title AS "Document Title", + fs.name AS "Family ID", + f.title AS "Family Title", + f.description AS "Family Summary", + n1.collection_titles AS "Collection Title(s)", + n1.collection_descriptions AS "Collection Description(s)", + d.variant_name AS "Document Variant", + p.source_url AS "Document Content URL", + language_agg.display_name AS "Language", + o.name AS "Source", + fg.geo_isos AS "Geography ISOs", + fg.geo_display_values AS "Geographies", + fp.published_date AS "First event in timeline", + fp.last_changed AS "Last event in timeline", + n3.event_type_names AS "Full timeline of events (types)", + n3.event_dates AS "Full timeline of events (dates)", + d.created::DATE AS "Date Added to System", + f.last_modified::DATE AS "Last ModIFied on System", + d.import_id AS "Internal Document ID", + f.import_id AS "Internal Family ID", + n1.collection_import_ids AS "Internal Collection ID(s)", + INITCAP(d.valid_metadata::JSON #>> '{ + role,0}') AS "Document Role", + INITCAP(d.valid_metadata::JSON #>> '{ + type,0}') AS "Document Type", + CASE + WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' + ELSE INITCAP(f.family_category::TEXT) + END AS "Category", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'framework') + ), + ';' + ) AS "Framework", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'topic') + ), + ';' + ) AS "Topic/Response", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'hazard') + ), + ';' + ) AS "Hazard", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'sector') + ), + ';' + ) AS "Sector", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'keyword') + ), + ';' + ) AS "Keyword", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'instrument') + ), + ';' + ) AS "Instrument", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author') + ), + ';' + ) AS "Author", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author_type') + ), + ';' + ) AS "Author Type" +FROM + physical_document AS p + INNER JOIN family_document AS d ON p.id = d.physical_document_id + INNER JOIN family AS f ON d.family_import_id = f.import_id + FULL JOIN fg ON f.import_id = fg.family_import_id + INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id + INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id + INNER JOIN organisation AS o ON c.organisation_id = o.id + INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id + FULL JOIN n1 ON f.import_id = n1.family_import_id + LEFT JOIN ( + SELECT + p.id, + STRING_AGG( + l.name, + ';' + ORDER BY + l.name + ) AS display_name + FROM + physical_document AS p + LEFT JOIN + physical_document_language AS pdl + ON p.id = pdl.document_id + LEFT JOIN language AS l ON pdl.language_id = l.id + GROUP BY + p.id + ) AS language_agg ON d.physical_document_id = language_agg.id + FULL JOIN ( + SELECT + family_event.family_import_id, + STRING_AGG(family_event.import_id, ';') AS event_import_ids, + STRING_AGG(family_event.title, ';') AS event_titles, + STRING_AGG(family_event.event_type_name, ';') AS event_type_names, + STRING_AGG(family_event.date::DATE::TEXT, ';') AS event_dates + FROM + family_event + INNER JOIN + family + ON family_event.family_import_id = family.import_id + GROUP BY + family_event.family_import_id + ) AS n3 ON f.import_id = n3.family_import_id + LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id + LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id + LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id +WHERE + d.last_modified < :ingest_cycle_start + AND fc.corpus_import_id = ANY(:allowed_corpora_ids) +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + n1.family_import_id ASC diff --git a/app/repository/sql/pipeline.sql b/app/repository/sql/pipeline.sql index af6023e6..7a5d0e40 100644 --- a/app/repository/sql/pipeline.sql +++ b/app/repository/sql/pipeline.sql @@ -1,36 +1,43 @@ -WITH deduplicated_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - Count(*) > 1 ) duplicates - left join + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN slug - ON duplicates.family_import_id = slug.family_import_id + ON duplicates.family_import_id = slug.family_import_id ORDER BY slug.family_import_id DESC, slug.created DESC, - slug.ctid DESC ), - unique_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count + slug.ctid DESC +), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count FROM slug WHERE @@ -38,219 +45,235 @@ WITH deduplicated_family_slugs AS ( SELECT GROUP BY slug.family_import_id HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_family_slugs AS ( SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" - FROM - deduplicated_family_slugs - UNION - ALL SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC +), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count FROM - unique_family_slugs - ORDER BY - family_import_id DESC, - created DESC ), deduplicated_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) > 1 ) duplicates - left join slug - ON duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - unique_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_doc_slugs AS ( - SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM - deduplicated_doc_slugs - UNION - ALL SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM - unique_doc_slugs - ORDER BY - family_document_import_id DESC, - created DESC - ), event_dates AS ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date - FROM - family_event - GROUP BY - family_import_id - ) SELECT - f.title AS "family_title", - p.title AS "physical_document_title", - f.description AS "family_description", - CASE - WHEN f.family_category IN ('UNFCCC', - 'MCF') THEN Upper(f.family_category::text) - ELSE Initcap(f.family_category::text) - END "family_category", - fp.published_date AS "family_published_date", - d.import_id AS "family_document_import_id", - ds.name AS "family_document_slug", - f.import_id AS "family_import_id", - fs.name AS "family_slug", - p.source_url AS "physical_document_source_url", - d.valid_metadata::json#>>'{type,0}' AS "family_document_type", - o.name AS "organisation_name", - geos.geographies AS "geographies", - c.import_id AS "corpus_import_id", - c.corpus_type_name AS "corpus_type_name", - langs.languages AS "languages", - fm.value AS "family_metadata", - d.valid_metadata AS "family_document_metadata" - FROM - physical_document p - join - family_document d - ON p.id = d.physical_document_id - join - family f - ON d.family_import_id = f.import_id full - join - ( - SELECT - family_geography.family_import_id AS "family_import_id", - string_agg(geography.value, - ';') AS geo_isos, - string_agg(geography.display_value, - ';') AS geo_display_values - FROM - geography - inner join - family_geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg - ON fg.family_import_id=f.import_id - join - family_corpus fc - ON f.import_id = fc.family_import_id - join - corpus c - ON fc.corpus_import_id = c.import_id - join - organisation o - ON c.organisation_id = o.id - join - family_metadata fm - ON fm.family_import_id = f.import_id - left outer join - ( - SELECT - family_document.import_id AS family_document_import_id, - json_agg(DISTINCT(LANGUAGE.name)) AS languages - FROM - family_document - join - physical_document_language - ON physical_document_language.document_id = family_document.physical_document_id - join - LANGUAGE - ON LANGUAGE.id = physical_document_language.language_id - GROUP BY - family_document.import_id - ) AS langs - ON langs.family_document_import_id = d.import_id - left outer join - ( - SELECT - family_geography.family_import_id AS family_import_id, - json_agg(DISTINCT(geography.value)) AS geographies - FROM - family_geography - join - geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) AS geos - ON geos.family_import_id = f.import_id - left join - most_recent_doc_slugs ds - ON ds.family_document_import_id = d.import_id - left join - most_recent_family_slugs fs - ON fs.family_import_id = f.import_id - left join - event_dates fp - ON fp.family_import_id = f.import_id + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC +), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN + COUNT(*) FILTER ( WHERE - d.document_status != 'DELETED' - AND fg.family_import_id = f.import_id - ORDER BY - d.last_modified DESC, - d.created DESC, - d.ctid DESC, - f.import_id + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 + THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date + FROM + family_event + GROUP BY + family_event.family_import_id +), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id +), + +geos AS ( + SELECT + family_geography.family_import_id, + JSON_AGG(DISTINCT geography.value) AS geographies + FROM + family_geography + INNER JOIN geography ON family_geography.geography_id = geography.id + GROUP BY + family_geography.family_import_id +) + +SELECT + f.title AS family_title, + p.title AS physical_document_title, + f.description AS family_description, + fp.published_date AS family_published_date, + d.import_id AS family_document_import_id, + ds.name AS family_document_slug, + f.import_id AS family_import_id, + fs.name AS family_slug, + p.source_url AS physical_document_source_url, + o.name AS organisation_name, + geos.geographies, + c.import_id AS corpus_import_id, + c.corpus_type_name, + langs.languages, + fm.value AS family_metadata, + d.valid_metadata AS family_document_metadata, + CASE + WHEN + f.family_category IN ('UNFCCC', 'MCF') + THEN UPPER(f.family_category::TEXT) + ELSE INITCAP(f.family_category::TEXT) + END AS family_category, + d.valid_metadata::JSON #>> '{type,0}' AS family_document_type +FROM + physical_document AS p +INNER JOIN family_document AS d ON p.id = d.physical_document_id +INNER JOIN family AS f ON d.family_import_id = f.import_id +FULL JOIN fg ON f.import_id = fg.family_import_id +INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id +INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id +INNER JOIN organisation AS o ON c.organisation_id = o.id +INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id +LEFT OUTER JOIN ( + SELECT + family_document.import_id AS family_document_import_id, + JSON_AGG(DISTINCT language.name) AS languages + FROM + family_document + INNER JOIN + physical_document_language + ON + family_document.physical_document_id + = physical_document_language.document_id + INNER JOIN + language + ON physical_document_language.language_id = language.id + GROUP BY + family_document.import_id +) AS langs ON d.import_id = langs.family_document_import_id +LEFT OUTER JOIN geos ON f.import_id = geos.family_import_id +LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id +LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id +LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id +WHERE + d.document_status != 'DELETED' + AND fg.family_import_id = f.import_id +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + f.import_id ASC diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 9d649067..09cb2e69 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -1,20 +1,33 @@ -SELECT - slug.family_document_import_id, slug.family_import_id +-- First query for family document slugs +SELECT DISTINCT + slug.family_document_import_id, + slug.family_import_id FROM slug -LEFT JOIN family ON family.import_id = slug.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}) + INNER JOIN family_document + ON slug.family_document_import_id = family_document.import_id + INNER JOIN family + ON family_document.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id +WHERE + slug.name = :slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) UNION -SELECT - slug.family_document_import_id, slug.family_import_id +-- Second query for family slugs +SELECT DISTINCT + NULL AS family_document_import_id, + slug.family_import_id FROM slug -LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id -LEFT JOIN family ON family.import_id = family_document.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}); + INNER JOIN family + ON slug.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id +WHERE + slug.name = :slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) diff --git a/makefile-docker.defs b/makefile-docker.defs index 276a67f0..b41a9358 100644 --- a/makefile-docker.defs +++ b/makefile-docker.defs @@ -123,7 +123,7 @@ test_non_search: docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS} test: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests ${ARGS} # ---------------------------------- # tasks diff --git a/pyproject.toml b/pyproject.toml index e5569fe6..849d3b4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.14" +version = "1.19.15" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }]