From b1ec83b158d8d15e67ac19bc4cbdeb46934588aa Mon Sep 17 00:00:00 2001
From: Kalyan Dutia <kalyan.dutia@gmail.com>
Date: Thu, 28 Nov 2024 11:18:38 +0000
Subject: [PATCH 1/2] fix: exact match should not perform stemming (#422)

* bump sdk version to 1.9.5

* update vespa schemas in line with prod

* bump version in pyproject.toml to 1.19.14
---
 poetry.lock                                   | 10 +-
 pyproject.toml                                |  4 +-
 .../schemas/document_passage.sd               | 66 ++++++++++++-
 .../schemas/family_document.sd                | 92 +++++++++++++++++++
 4 files changed, 162 insertions(+), 10 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 2e443fb5..6d497b9e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -702,13 +702,13 @@ files = [
 
 [[package]]
 name = "cpr-sdk"
-version = "1.9.3"
+version = "1.9.5"
 description = ""
 optional = false
 python-versions = "<4.0,>=3.10"
 files = [
-    {file = "cpr_sdk-1.9.3-py3-none-any.whl", hash = "sha256:1cd725de96d3af7a1f74c5d5eab18b207702944549f945e5392a039242c948b8"},
-    {file = "cpr_sdk-1.9.3.tar.gz", hash = "sha256:f7ee60d81b2c9520cae237742582052472b5581601a8bb33862194bae8c4e0e1"},
+    {file = "cpr_sdk-1.9.5-py3-none-any.whl", hash = "sha256:dd32806499b5bb44c98be1f4135b88406f1a77abcf60c7d4f61ac740de979da3"},
+    {file = "cpr_sdk-1.9.5.tar.gz", hash = "sha256:addc22e557381935ac66c95721312c4a37080fda7419381971cdf6e7cb331fe0"},
 ]
 
 [package.dependencies]
@@ -4250,4 +4250,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "40e1911a3a0b0211027e7326d521d1fc9cdf8aa6c798279f1d5134a4bc2a5f57"
+content-hash = "d96225e60602c52c47631f4a12f8519bcb5d1e2f5d2b15c2f57760db6b28c33c"
diff --git a/pyproject.toml b/pyproject.toml
index 11d675fa..e5569fe6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "navigator_backend"
-version = "1.19.13"
+version = "1.19.14"
 description = ""
 authors = ["CPR-dev-team <tech@climatepolicyradar.org>"]
 packages = [{ include = "app" }, { include = "tests" }]
@@ -10,7 +10,7 @@ python = "^3.10"
 Authlib = "^0.15.5"
 bcrypt = "^3.2.0"
 boto3 = "^1.26"
-cpr_sdk = { version = "1.9.3", extras = ["vespa"] }
+cpr_sdk = { version = "1.9.5", extras = ["vespa"] }
 fastapi = "^0.104.1"
 fastapi-health = "^0.4.0"
 fastapi-pagination = { extras = ["sqlalchemy"], version = "^0.12.19" }
diff --git a/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd b/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd
index 29492a3a..863929db 100644
--- a/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd
+++ b/tests/search/vespa/fixtures/vespa_test_schema/schemas/document_passage.sd
@@ -1,5 +1,10 @@
 schema document_passage {
 
+    field text_block_not_stemmed type string {
+        indexing: input text_block | summary | index
+        stemming: none
+    }
+
     document document_passage {
 
         field search_weights_ref type reference<search_weights> {
@@ -134,6 +139,37 @@ schema document_passage {
         summary concepts {}
     }
 
+    document-summary search_summary_with_tokens {
+        summary family_name {}
+        summary family_description {}
+        summary family_import_id {}
+        summary family_slug {}
+        summary family_category {}
+        summary family_publication_ts {}
+        summary family_geography {}
+        summary family_geographies {}
+        summary family_source {}
+        summary document_import_id {}
+        summary document_slug {}
+        summary document_languages {}
+        summary document_content_type {}
+        summary document_cdn_object {}
+        summary document_source_url {}
+        summary corpus_import_id {}
+        summary corpus_type_name {}
+        summary metadata {}
+        summary text_block {}
+        summary text_block_id {}
+        summary text_block_type {}
+        summary text_block_page {}
+        summary text_block_coords {}
+        summary concepts {}
+        summary text_block_tokens {
+            source: text_block
+            tokens
+        }
+    }
+
     rank-profile exact inherits default {
         function text_score() {
             expression: attribute(passage_weight) * fieldMatch(text_block)
@@ -141,7 +177,17 @@ schema document_passage {
         first-phase {
             expression: text_score()
         }
-        match-features: text_score()
+        match-features: text_score() fieldMatch(text_block)
+    }
+    
+    rank-profile exact_not_stemmed inherits default { # exact-match ranking on the unstemmed copy of text_block
+        function text_score() {
+            expression: attribute(passage_weight) * fieldMatch(text_block_not_stemmed)
+        }
+        first-phase {
+            expression: text_score()
+        }
+        match-features: text_score() fieldMatch(text_block_not_stemmed)
     }
 
     rank-profile hybrid_no_closeness inherits default {
@@ -151,7 +197,7 @@ schema document_passage {
         first-phase {
             expression: text_score()
         }
-        match-features: text_score()
+        match-features: text_score() bm25(text_block)
     }
 
     rank-profile hybrid inherits default {
@@ -164,6 +210,20 @@ schema document_passage {
         first-phase {
             expression: text_score()
         }
-        match-features: text_score()
+        match-features: text_score() bm25(text_block) closeness(text_embedding)
+    }
+    
+    rank-profile hybrid_custom_weight inherits default {
+        inputs {
+            query(query_embedding) tensor<float>(x[768])
+            query(bm25_weight) double
+        }
+        function text_score() {
+            expression: attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding))
+        }
+        first-phase {
+            expression: text_score()
+        }
+        match-features: text_score() bm25(text_block) closeness(text_embedding)
     }
 }
diff --git a/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd b/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd
index e62d6df5..e56963b2 100644
--- a/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd
+++ b/tests/search/vespa/fixtures/vespa_test_schema/schemas/family_document.sd
@@ -1,5 +1,15 @@
 schema family_document {
 
+    field family_name_not_stemmed type string {
+        indexing: input family_name_index | index
+        stemming: none
+    }
+
+    field family_description_not_stemmed type string {
+        indexing: input family_description_index | index
+        stemming: none
+    }
+
     document family_document {
 
         field search_weights_ref type reference<search_weights> {
@@ -170,6 +180,19 @@ schema family_document {
         }
         match-features: name_score() description_score()
     }
+    
+    rank-profile exact_not_stemmed inherits default {
+        function name_score() {
+            expression: attribute(name_weight) * fieldMatch(family_name_not_stemmed)
+        }
+        function description_score() {
+            expression: attribute(description_weight) * fieldMatch(family_description_not_stemmed)
+        }
+        first-phase {
+            expression: name_score() + description_score()
+        }
+        match-features: name_score() description_score()
+    }
 
     rank-profile hybrid_no_closeness inherits default {
         function name_score() {
@@ -199,6 +222,40 @@ schema family_document {
         }
         match-features: name_score() description_score()
     }
+    
+    rank-profile hybrid_no_description_embedding inherits default {
+        inputs {
+            query(query_embedding) tensor<float>(x[768])
+        }
+        function name_score() {
+            expression: attribute(name_weight) * bm25(family_name_index)
+        }
+        function description_score() {
+            expression: attribute(description_weight) * bm25(family_description_index)
+        }
+        first-phase {
+            expression: name_score() + description_score()
+        }
+        match-features: name_score() description_score()
+    }
+
+    rank-profile hybrid_custom_weight inherits default {
+        inputs {
+            query(query_embedding) tensor<float>(x[768])
+            query(bm25_weight) double
+        }
+        function name_score() {
+            expression: attribute(name_weight) * bm25(family_name_index)
+        }
+        function description_score() {
+            expression: attribute(description_weight) * bm25(family_description_index)
+        }
+        first-phase {
+            expression: name_score() + description_score()
+        }
+        match-features: name_score() description_score()
+    }
+
 
     document-summary search_summary {
         summary family_name {}
@@ -223,4 +280,39 @@ schema family_document {
         summary collection_title {}
         summary collection_summary {}
     }
+
+    document-summary search_summary_with_tokens {
+        summary family_name {}
+        summary family_description {}
+        summary family_import_id {}
+        summary family_slug {}
+        summary family_category {}
+        summary family_publication_ts {}
+        summary family_geography {}
+        summary family_geographies {}
+        summary family_source {}
+        summary document_import_id {}
+        summary document_title {}
+        summary document_slug {}
+        summary document_languages {}
+        summary document_content_type {}
+        summary document_cdn_object {}
+        summary document_source_url {}
+        summary metadata {}
+        summary corpus_import_id {}
+        summary corpus_type_name {}
+        summary collection_title {}
+        summary collection_summary {}
+        summary family_name_index {}
+        summary family_name_index_tokens {
+            source: family_name_index
+            tokens
+        }
+        summary family_description_index {}
+        summary family_description_index_tokens {
+            source: family_description_index
+            tokens
+        }
+        from-disk
+    }
 }

From 9089898ff4cf0e833910323d5db295bf482b228b Mon Sep 17 00:00:00 2001
From: Katy Baulch <46493669+katybaulch@users.noreply.github.com>
Date: Thu, 28 Nov 2024 14:04:37 +0000
Subject: [PATCH 2/2] Add sql linting using SQLFluff (#417)

* Enable sql-formatter

* Fix formatting

* Replace with variables & bind

* Update .git-blame-ignore-revs

* Fix formatting

* Remove sqlformatter & enable sqlfluff

* Fix formatting

* Fix formatting

* Use sqlfluff lint and fix

* Bump pyproject

* Fix Download query placeholders

* Add sql linting

* Fixed slug_lookup query

* Fix query param binding

* Update .sqlfluff

* Rename keyword identifier to non keyword identifier

* Revert makefile changes

* Fix linting errors

* Update download query logic based on linting

* Update formatting

* Remove debug function

* Revert SQLalchemy 2
---
 .git-blame-ignore-revs             |   4 +
 .trunk/configs/.sqlfluff           |  30 ++
 .trunk/trunk.yaml                  |  50 +++
 app/repository/document.py         |  45 +--
 app/repository/download.py         |  46 ++-
 app/repository/helpers.py          |   6 +-
 app/repository/sql/download.sql    | 596 +++++++++++++++++------------
 app/repository/sql/pipeline.sql    | 503 ++++++++++++------------
 app/repository/sql/slug_lookup.sql |  43 ++-
 makefile-docker.defs               |   2 +-
 pyproject.toml                     |   2 +-
 11 files changed, 774 insertions(+), 553 deletions(-)
 create mode 100644 .trunk/configs/.sqlfluff

diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 9a66c0e8..8c4c0200 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -23,3 +23,7 @@
 
 # Updating the test data file for document passages to be indent=2
 44624dcd1fa0835708bd9187a39bb0da8a31cd03
+
+# Fix SQL query formatting
+047766a85f086fc0986a6f2b49fee9d73fa219e8
+ab3476708920c5760f058ec40d14d008f94f5bad
diff --git a/.trunk/configs/.sqlfluff b/.trunk/configs/.sqlfluff
new file mode 100644
index 00000000..de7aacd3
--- /dev/null
+++ b/.trunk/configs/.sqlfluff
@@ -0,0 +1,30 @@
+[sqlfluff]
+dialect = postgres
+exclude_rules = LT02, LT09
+
+[sqlfluff:indentation]
+indented_ctes = True
+
+[sqlfluff:layout:type:colon]
+spacing_before = single
+spacing_after = single
+
+[sqlfluff:layout:type:parameter]
+spacing_before = touch
+spacing_after = any
+
+[sqlfluff:rules:references.special_chars]
+allow_space_in_identifier = True
+additional_allowed_characters = ["/", "_", "-", "(", ")"]
+
+[sqlfluff:rules:capitalisation.keywords]
+capitalisation_policy = upper
+
+[sqlfluff:rules:capitalisation.identifiers]
+extended_capitalisation_policy = lower
+
+[sqlfluff:rules:capitalisation.functions]
+extended_capitalisation_policy = upper
+
+[sqlfluff:rules:capitalisation.types]
+extended_capitalisation_policy = upper
diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml
index 6d746ff9..31ff5439 100644
--- a/.trunk/trunk.yaml
+++ b/.trunk/trunk.yaml
@@ -5,6 +5,14 @@ version: 0.1
 cli:
   version: 1.22.0
 
+tools:
+  definitions:
+    - name: sqlfluff
+      runtime: python
+      package: sqlfluff
+      shims: [sqlfluff]
+      known_good_version: 1.4.5
+
 # Trunk provides extensibility via plugins.
 # (https://docs.trunk.io/plugins)
 plugins:
@@ -27,6 +35,7 @@ lint:
   disabled:
     - hadolint
     - oxipng
+
   definitions:
     - name: bandit
       direct_configs: [bandit.yaml]
@@ -34,6 +43,45 @@ lint:
         - name: lint
           run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target}
 
+    - name: sqlfluff
+      files: [sql, sql-j2, dml, ddl]
+      tools: [sqlfluff]
+      description: A dialect-flexible and configurable SQL linter
+      known_good_version: 1.4.5
+      direct_configs:
+        - .sqlfluff
+      affects_cache:
+        - pyproject.toml
+      suggest_if: config_present
+      commands:
+        - name: lint
+          run: sqlfluff lint ${target} --format json --nofail
+          output: sarif
+          success_codes: [0]
+          read_output_from: stdout
+          parser:
+            runtime: python
+            run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py
+
+        - name: fix
+          version: ">=3.0.0"
+          run: sqlfluff fix ${target} --disable-progress-bar
+          output: rewrite
+          formatter: true
+          in_place: true
+          success_codes: [0, 1]
+          enabled: false
+          batch: true
+
+        - name: format
+          run: sqlfluff format ${target} --disable-progress-bar
+          output: rewrite
+          formatter: true
+          in_place: true
+          success_codes: [0, 1]
+          enabled: false
+          batch: true
+
   ignore:
     - linters: [ALL]
       paths:
@@ -45,6 +93,8 @@ lint:
         - LICENSE.md
 
   enabled:
+    - sqlfluff@3.2.5:
+        commands: [lint, fix, format]
     - actionlint@1.6.27
     - bandit@1.7.8
     - black@24.4.2
diff --git a/app/repository/document.py b/app/repository/document.py
index 57579a26..73d19d47 100644
--- a/app/repository/document.py
+++ b/app/repository/document.py
@@ -1,8 +1,4 @@
-"""
-Functions to support the documents endpoints
-
-old functions (non DFC) are moved to the deprecated_documents.py file.
-"""
+"""Database helper functions for the documents entity."""
 
 import logging
 import os
@@ -22,8 +18,9 @@
 from db_client.models.dfce.metadata import FamilyMetadata
 from db_client.models.document.physical_document import PhysicalDocument
 from db_client.models.organisation.organisation import Organisation
-from sqlalchemy import func
+from sqlalchemy import bindparam, func, text
 from sqlalchemy.orm import Session
+from sqlalchemy.types import ARRAY, String
 
 from app.models.document import (
     CollectionOverviewResponse,
@@ -42,22 +39,6 @@
 _LOGGER = logging.getLogger(__file__)
 
 
-def get_slugged_object_from_allowed_corpora_query(
-    template_query, slug_name: str, allowed_corpora_ids: list[str]
-) -> str:
-    """Create download whole database query, replacing variables.
-
-    :param str ingest_cycle_start: The current ingest cycle date.
-    :param list[str] allowed_corpora_ids: The corpora from which we
-        should allow the data to be dumped.
-    :return str: The SQL query to perform on the database session.
-    """
-    corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
-    return template_query.replace("{slug_name}", slug_name).replace(  # type: ignore
-        "{allowed_corpora_ids}", corpora_ids
-    )  # type: ignore
-
-
 def get_slugged_objects(
     db: Session, slug: str, allowed_corpora: Optional[list[str]] = None
 ) -> tuple[Optional[str], Optional[str]]:
@@ -74,14 +55,22 @@ def get_slugged_objects(
     :return tuple[Optional[str], Optional[str]]: the FamilyDocument
         import id or the Family import_id.
     """
-    if allowed_corpora is not None:
-        query_template = get_query_template(
-            os.path.join("app", "repository", "sql", "slug_lookup.sql")
+    if allowed_corpora not in [None, []]:
+        query_template = text(
+            get_query_template(
+                os.path.join("app", "repository", "sql", "slug_lookup.sql")
+            )
+        )
+
+        query_template = query_template.bindparams(
+            bindparam("slug_name", type_=String),
+            bindparam(
+                "allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String)
+            ),
         )
-        query = get_slugged_object_from_allowed_corpora_query(
-            query_template, slug, allowed_corpora
+        query = db.execute(
+            query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora}
         )
-        query = db.execute(query)
     else:
         query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter(
             Slug.name == slug
diff --git a/app/repository/download.py b/app/repository/download.py
index 1ed90396..33592cc0 100644
--- a/app/repository/download.py
+++ b/app/repository/download.py
@@ -5,6 +5,8 @@
 
 import pandas as pd
 from fastapi import Depends
+from sqlalchemy import bindparam, text
+from sqlalchemy.types import ARRAY, DATETIME, String
 
 from app.clients.db.session import get_db
 from app.repository.helpers import get_query_template
@@ -12,32 +14,34 @@
 _LOGGER = getLogger(__name__)
 
 
-def create_query(
-    template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str]
-) -> str:
-    """Create download whole database query, replacing variables.
+def get_whole_database_dump(
+    ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db)
+):
+    """Get whole database dump and bind variables.
 
     :param str ingest_cycle_start: The current ingest cycle date.
-    :param list[str] allowed_corpora_ids: The corpora from which we
+    :param list[str] allowed_corpora_ids: The corpora from which we
         should allow the data to be dumped.
-    :return str: The SQL query to perform on the database session.
+    :return pd.DataFrame: A DataFrame containing the results of the SQL
+        query that gets the whole database dump in our desired format.
     """
-    corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
-    return template_query.replace(  # type: ignore
-        "{ingest_cycle_start}", ingest_cycle_start
-    ).replace(
-        "{allowed_corpora_ids}", corpora_ids
-    )  # type: ignore
-
-
-def get_whole_database_dump(
-    ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db)
-):
-    query_template = get_query_template(
-        os.path.join("app", "repository", "sql", "download.sql")
+    query = text(
+        get_query_template(os.path.join("app", "repository", "sql", "download.sql"))
+    ).bindparams(
+        bindparam("ingest_cycle_start", type_=DATETIME),
+        bindparam(
+            "allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String)
+        ),
     )
-    query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids)
 
     with db.connection() as conn:
-        df = pd.read_sql(query, conn.connection)
+        result = conn.execute(
+            query,
+            {
+                "ingest_cycle_start": ingest_cycle_start,
+                "allowed_corpora_ids": allowed_corpora_ids,
+            },
+        )
+        columns = result.keys()
+        df = pd.DataFrame(result.fetchall(), columns=columns)
         return df
diff --git a/app/repository/helpers.py b/app/repository/helpers.py
index e976683b..958e0b38 100644
--- a/app/repository/helpers.py
+++ b/app/repository/helpers.py
@@ -1,8 +1,4 @@
-"""
-Functions to support the documents endpoints
-
-old functions (non DFC) are moved to the deprecated_documents.py file.
-"""
+"""Helper functions for the repository layer."""
 
 from functools import lru_cache
 
diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql
index 8a807080..15bbf8ac 100644
--- a/app/repository/sql/download.sql
+++ b/app/repository/sql/download.sql
@@ -1,243 +1,355 @@
-WITH
-deduplicated_family_slugs as (
-  SELECT
-  distinct ON (slug.family_import_id)
-  slug.family_import_id, slug.created, slug.name
-  FROM (
-    SELECT
-    slug.family_import_id as "family_import_id",
-    count(*) as count
-    FROM slug
-    WHERE slug.family_import_id is not null
-    group by slug.family_import_id
-    having count(*) > 1
-  ) duplicates
-  left join slug
-  on duplicates.family_import_id = slug.family_import_id
-  order by slug.family_import_id desc, slug.created desc, slug.ctid desc
-),
-unique_family_slugs as (
-  SELECT
-  distinct ON (slug.family_import_id)
-  slug.family_import_id, slug.created, slug.name
-  FROM (
-    SELECT
-    slug.family_import_id as "family_import_id",
-    count(*) as count
-    FROM slug
-    WHERE slug.family_import_id is not null
-    group by slug.family_import_id
-    having count(*) = 1
-  ) non_duplicates
-  left join slug
-  on non_duplicates.family_import_id = slug.family_import_id
-  order by slug.family_import_id desc, slug.created desc, slug.ctid desc
-  ), most_recent_family_slugs as (
-  SELECT
-  deduplicated_family_slugs.family_import_id as "family_import_id",
-  deduplicated_family_slugs.created as "created",
-  deduplicated_family_slugs.name as "name"
-  FROM deduplicated_family_slugs
-  UNION ALL
-  SELECT
-  unique_family_slugs.family_import_id as "family_import_id",
-  unique_family_slugs.created as "created",
-  unique_family_slugs.name as "name"
-  FROM unique_family_slugs
-  order by family_import_id desc, created desc
-  ), deduplicated_doc_slugs as (
-  SELECT
-  distinct ON (slug.family_document_import_id)
-  slug.family_document_import_id,
-  slug.created,
-  slug.name
-  FROM (
-    SELECT
-    slug.family_document_import_id as "family_document_import_id",
-    count(*) as count
-    FROM slug
-    WHERE slug.family_document_import_id is not null
-    group by slug.family_document_import_id
-    having count(*) >  1
-  ) duplicates
-  left join slug
-  on duplicates.family_document_import_id = slug.family_document_import_id
-  order by
-  slug.family_document_import_id desc, slug.created desc, slug.ctid desc
-),
-unique_doc_slugs as (
-  SELECT
-  distinct ON (slug.family_document_import_id)
-  slug.family_document_import_id,
-  slug.created,
-  slug.name
-  FROM (
-    SELECT
-    slug.family_document_import_id as "family_document_import_id",
-    count(*) as count
-    FROM slug
-    WHERE slug.family_document_import_id is not null
-    group by slug.family_document_import_id
-    having count(*) = 1
-  ) non_duplicates
-  left join slug
-  on non_duplicates.family_document_import_id = slug.family_document_import_id
-  order by
-  slug.family_document_import_id desc, slug.created desc, slug.ctid desc
-  ), most_recent_doc_slugs as (
-  SELECT
-  deduplicated_doc_slugs.family_document_import_id
-  as "family_document_import_id",
-  deduplicated_doc_slugs.created,
-  deduplicated_doc_slugs.name
-  FROM deduplicated_doc_slugs
-  UNION ALL
-  SELECT
-  unique_doc_slugs.family_document_import_id as "family_document_import_id",
-  unique_doc_slugs.created,
-  unique_doc_slugs.name
-  FROM unique_doc_slugs
-  order by family_document_import_id desc, created desc
-  ), event_dates as (
-  SELECT
-      family_event.family_import_id AS family_import_id,
-      CASE
-          WHEN COUNT(*) FILTER (
-              WHERE family_event.event_type_name =
-              (family_event.valid_metadata->'datetime_event_name'->>0)
-          ) > 0 THEN
-              MIN(CASE
-                  WHEN family_event.event_type_name =
-                  (family_event.valid_metadata->'datetime_event_name'->>0)
-                  THEN family_event.date::TIMESTAMPTZ
-              END)
-          ELSE
-              MIN(family_event.date::TIMESTAMPTZ)
-      END AS published_date,
-      max(family_event.date::date) last_changed
-  FROM
-      family_event
-  GROUP BY
-      family_import_id
-)
+WITH deduplicated_family_slugs AS (
+        SELECT DISTINCT
+            ON (slug.family_import_id) slug.family_import_id,
+            slug.created,
+            slug.name
+        FROM
+            (
+                SELECT
+                    slug.family_import_id,
+                    COUNT(*) AS count
+                FROM
+                    slug
+                WHERE
+                    slug.family_import_id IS NOT NULL
+                GROUP BY
+                    slug.family_import_id
+                HAVING
+                    COUNT(*) > 1
+            ) AS duplicates
+        LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id
+        ORDER BY
+            slug.family_import_id DESC,
+            slug.created DESC,
+            slug.ctid DESC
+    ),
+
+unique_family_slugs AS (
+        SELECT DISTINCT
+            ON (slug.family_import_id) slug.family_import_id,
+            slug.created,
+            slug.name
+        FROM
+            (
+                SELECT
+                    slug.family_import_id,
+                    COUNT(*) AS count
+                FROM
+                    slug
+                WHERE
+                    slug.family_import_id IS NOT NULL
+                GROUP BY
+                    slug.family_import_id
+                HAVING
+                    COUNT(*) = 1
+            ) AS non_duplicates
+        LEFT JOIN
+            slug
+            ON non_duplicates.family_import_id = slug.family_import_id
+        ORDER BY
+            slug.family_import_id DESC,
+            slug.created DESC,
+            slug.ctid DESC
+    ),
+
+most_recent_family_slugs AS (
+        SELECT
+            deduplicated_family_slugs.family_import_id,
+            deduplicated_family_slugs.created,
+            deduplicated_family_slugs.name
+        FROM
+            deduplicated_family_slugs
+        UNION ALL
+        SELECT
+            unique_family_slugs.family_import_id,
+            unique_family_slugs.created,
+            unique_family_slugs.name
+        FROM
+            unique_family_slugs
+        ORDER BY
+            family_import_id DESC,
+            created DESC
+    ),
+
+deduplicated_doc_slugs AS (
+        SELECT DISTINCT
+            ON (slug.family_document_import_id) slug.family_document_import_id,
+            slug.created,
+            slug.name
+        FROM
+            (
+                SELECT
+                    slug.family_document_import_id,
+                    COUNT(*) AS count
+                FROM
+                    slug
+                WHERE
+                    slug.family_document_import_id IS NOT NULL
+                GROUP BY
+                    slug.family_document_import_id
+                HAVING
+                    COUNT(*) > 1
+            ) AS duplicates
+        LEFT JOIN
+            slug
+            ON
+                duplicates.family_document_import_id
+                = slug.family_document_import_id
+        ORDER BY
+            slug.family_document_import_id DESC,
+            slug.created DESC,
+            slug.ctid DESC
+    ),
+
+unique_doc_slugs AS (
+        SELECT DISTINCT
+            ON (slug.family_document_import_id) slug.family_document_import_id,
+            slug.created,
+            slug.name
+        FROM
+            (
+                SELECT
+                    slug.family_document_import_id,
+                    COUNT(*) AS count
+                FROM
+                    slug
+                WHERE
+                    slug.family_document_import_id IS NOT NULL
+                GROUP BY
+                    slug.family_document_import_id
+                HAVING
+                    COUNT(*) = 1
+            ) AS non_duplicates
+        LEFT JOIN
+            slug
+            ON
+                non_duplicates.family_document_import_id
+                = slug.family_document_import_id
+        ORDER BY
+            slug.family_document_import_id DESC,
+            slug.created DESC,
+            slug.ctid DESC
+    ),
+
+most_recent_doc_slugs AS (
+        SELECT
+            deduplicated_doc_slugs.family_document_import_id,
+            deduplicated_doc_slugs.created,
+            deduplicated_doc_slugs.name
+        FROM
+            deduplicated_doc_slugs
+        UNION ALL
+        SELECT
+            unique_doc_slugs.family_document_import_id,
+            unique_doc_slugs.created,
+            unique_doc_slugs.name
+        FROM
+            unique_doc_slugs
+        ORDER BY
+            family_document_import_id DESC,
+            created DESC
+    ),
+
+event_dates AS (
+        SELECT
+            family_event.family_import_id,
+            CASE
+                WHEN COUNT(*) FILTER (
+                    WHERE
+                        family_event.event_type_name = (
+                            family_event.valid_metadata
+                            -> 'datetime_event_name'
+                            ->> 0
+                        )
+                ) > 0 THEN MIN(
+                    CASE
+                        WHEN family_event.event_type_name = (
+                            family_event.valid_metadata
+                            -> 'datetime_event_name'
+                            ->> 0
+                        ) THEN family_event.date::TIMESTAMPTZ
+                    END
+                )
+                ELSE MIN(family_event.date::TIMESTAMPTZ)
+            END AS published_date,
+            MAX(family_event.date::DATE) AS last_changed
+        FROM
+            family_event
+        GROUP BY
+            family_event.family_import_id
+    ),
+
+fg AS (
+        SELECT
+            family_geography.family_import_id,
+            STRING_AGG(geography.value, ';') AS geo_isos,
+            STRING_AGG(geography.display_value, ';') AS geo_display_values
+        FROM
+            geography
+            INNER JOIN
+                family_geography
+                ON geography.id = family_geography.geography_id
+        GROUP BY
+            family_geography.family_import_id
+    ),
+
+n1 AS (
+        SELECT
+            collection_family.family_import_id,
+            STRING_AGG(collection.import_id, ';') AS collection_import_ids,
+            STRING_AGG(collection.title, ';') AS collection_titles,
+            STRING_AGG(collection.description, ';') AS collection_descriptions
+        FROM
+            collection
+            INNER JOIN
+                collection_family
+                ON collection.import_id = collection_family.collection_import_id
+        GROUP BY
+            collection_family.family_import_id
+    )
+
 SELECT
-ds.name as "Document ID",
-p.title as "Document Title",
-fs.name as "Family ID",
-f.title as "Family Title",
-f.description as "Family Summary",
-n1.collection_titles as "Collection Title(s)",
-n1.collection_descriptions as "Collection Description(s)",
-INITCAP(d.valid_metadata::json#>>'{
-  role,0}') as
-"Document Role",
-d.variant_name as "Document Variant",
-p.source_url as "Document Content URL",
-INITCAP(d.valid_metadata::json#>>'{
-  type,0}') as
-"Document Type",
-CASE
-  WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC'
-  ELSE INITCAP(f.family_category::TEXT)
-END "Category",
-array_to_string(ARRAY(
-    SELECT jsonb_array_elements_text(fm.value->'framework')), ';')
-as "Framework",
-n2.language as "Language",
-o.name as "Source",
-fg.geo_isos as "Geography ISOs",
-fg.geo_display_values as "Geographies",
-array_to_string(ARRAY(
-    SELECT jsonb_array_elements_text(fm.value->'topic')), ';')
-as "Topic/Response",
-array_to_string(ARRAY(
-    SELECT jsonb_array_elements_text(fm.value->'hazard')), ';')
-as "Hazard",
-array_to_string(ARRAY(
-    SELECT jsonb_array_elements_text(fm.value->'sector')), ';')
-as "Sector",
-array_to_string(ARRAY(
-    SELECT jsonb_array_elements_text(fm.value->'keyword')), ';')
-as "Keyword",
-array_to_string(ARRAY(
-    SELECT jsonb_array_elements_text(fm.value->'instrument')), ';')
-as "Instrument",
-array_to_string(ARRAY(
-    SELECT jsonb_array_elements_text(fm.value->'author')), ';')
-as "Author",
-array_to_string(ARRAY(
-    SELECT jsonb_array_elements_text(fm.value->'author_type')), ';')
-as "Author Type",
-fp.published_date as "First event in timeline",
-fp.last_changed as "Last event in timeline",
-n3.event_type_names as "Full timeline of events (types)",
-n3.event_dates as "Full timeline of events (dates)",
-d.created::date as "Date Added to System",
-f.last_modified::date as "Last ModIFied on System",
-d.import_id as "Internal Document ID",
-f.import_id as "Internal Family ID",
-n1.collection_import_ids as "Internal Collection ID(s)"
-FROM physical_document p
-JOIN family_document d
-ON p.id = d.physical_document_id
-JOIN family f
-ON d.family_import_id = f.import_id
-FULL JOIN (
-  SELECT
-  family_geography.family_import_id as "family_import_id",
-  string_agg(geography.value, ';') AS geo_isos,
-  string_agg(geography.display_value, ';') AS geo_display_values
-  FROM
-  geography
-  INNER JOIN family_geography
-  ON geography.id = family_geography.geography_id
-  GROUP BY family_geography.family_import_id
-) fg ON fg.family_import_id=f.import_id
-join family_corpus fc
-on f.import_id = fc.family_import_id
-join corpus c
-on fc.corpus_import_id = c.import_id
-join organisation o
-on c.organisation_id = o.id
-join family_metadata fm
-on fm.family_import_id = f.import_id
-FULL JOIN (
-  SELECT
-  collection_family.family_import_id as "family_import_id",
-  string_agg(collection.import_id, ';') AS collection_import_ids,
-  string_agg(collection.title, ';') AS collection_titles,
-  string_agg(collection.description, ';') AS collection_descriptions
-  FROM
-  collection
-  INNER JOIN collection_family
-  ON collection_family.collection_import_id = collection.import_id
-  GROUP BY collection_family.family_import_id
-) n1 ON n1.family_import_id=f.import_id
-left JOIN (
-  SELECT
-  p.id as "id",
-  string_agg(l.name, ';' ORDER BY l.name) AS language
-  FROM physical_document p
-  left join physical_document_language pdl
-  on pdl.document_id = p.id
-  left join language l
-  on l.id = pdl.language_id
-  GROUP  BY p.id
-) n2 ON n2.id=d.physical_document_id
-FULL JOIN (
-  SELECT
-  family_event.family_import_id,
-  string_agg(family_event.import_id, ';') AS event_import_ids,
-  string_agg(family_event.title, ';') AS event_titles,
-  string_agg(family_event.event_type_name, ';') AS event_type_names,
-  string_agg(family_event.date::date::text, ';') AS event_dates
-  FROM family_event
-  INNER JOIN  family ON family.import_id = family_event.family_import_id
-  GROUP BY family_event.family_import_id
-) n3 ON n3.family_import_id=f.import_id
-LEFT JOIN most_recent_doc_slugs ds
-on ds.family_document_import_id = d.import_id
-LEFT JOIN most_recent_family_slugs fs on fs.family_import_id = f.import_id
-LEFT JOIN event_dates fp on fp.family_import_id = f.import_id
-WHERE d.last_modified < '{ingest_cycle_start}' AND fc.corpus_import_id in ({allowed_corpora_ids})
-ORDER BY d.last_modified desc, d.created desc, d.ctid desc, n1.family_import_id
+    ds.name AS "Document ID",
+    p.title AS "Document Title",
+    fs.name AS "Family ID",
+    f.title AS "Family Title",
+    f.description AS "Family Summary",
+    n1.collection_titles AS "Collection Title(s)",
+    n1.collection_descriptions AS "Collection Description(s)",
+    d.variant_name AS "Document Variant",
+    p.source_url AS "Document Content URL",
+    language_agg.display_name AS "Language",
+    o.name AS "Source",
+    fg.geo_isos AS "Geography ISOs",
+    fg.geo_display_values AS "Geographies",
+    fp.published_date AS "First event in timeline",
+    fp.last_changed AS "Last event in timeline",
+    n3.event_type_names AS "Full timeline of events (types)",
+    n3.event_dates AS "Full timeline of events (dates)",
+    d.created::DATE AS "Date Added to System",
+    f.last_modified::DATE AS "Last Modified on System", -- header spelling fixed
+    d.import_id AS "Internal Document ID",
+    f.import_id AS "Internal Family ID",
+    n1.collection_import_ids AS "Internal Collection ID(s)",
+    INITCAP(d.valid_metadata::JSON #>> '{role,0}')
+        AS "Document Role", -- path literal kept on a single line
+    INITCAP(d.valid_metadata::JSON #>> '{type,0}')
+        AS "Document Type", -- same literal form as pipeline.sql
+    CASE
+        WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC'
+        ELSE INITCAP(f.family_category::TEXT)
+    END AS "Category",
+    ARRAY_TO_STRING(
+        ARRAY(
+            SELECT
+                JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'framework')
+        ),
+        ';'
+    ) AS "Framework",
+    ARRAY_TO_STRING(
+        ARRAY(
+            SELECT
+                JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'topic')
+        ),
+        ';'
+    ) AS "Topic/Response",
+    ARRAY_TO_STRING(
+        ARRAY(
+            SELECT
+                JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'hazard')
+        ),
+        ';'
+    ) AS "Hazard",
+    ARRAY_TO_STRING(
+        ARRAY(
+            SELECT
+                JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'sector')
+        ),
+        ';'
+    ) AS "Sector",
+    ARRAY_TO_STRING(
+        ARRAY(
+            SELECT
+                JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'keyword')
+        ),
+        ';'
+    ) AS "Keyword",
+    ARRAY_TO_STRING(
+        ARRAY(
+            SELECT
+                JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'instrument')
+        ),
+        ';'
+    ) AS "Instrument",
+    ARRAY_TO_STRING(
+        ARRAY(
+            SELECT
+                JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author')
+        ),
+        ';'
+    ) AS "Author",
+    ARRAY_TO_STRING(
+        ARRAY(
+            SELECT
+                JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author_type')
+        ),
+        ';'
+    ) AS "Author Type"
+FROM
+    physical_document AS p
+    INNER JOIN family_document AS d ON p.id = d.physical_document_id
+    INNER JOIN family AS f ON d.family_import_id = f.import_id
+    LEFT JOIN fg ON f.import_id = fg.family_import_id -- fg-only rows never kept
+    INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id
+    INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id
+    INNER JOIN organisation AS o ON c.organisation_id = o.id
+    INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id
+    LEFT JOIN n1 ON f.import_id = n1.family_import_id -- n1-only rows never kept
+    LEFT JOIN (
+        SELECT
+            p.id,
+            STRING_AGG(
+                l.name,
+                ';'
+                ORDER BY
+                    l.name
+            ) AS display_name
+        FROM
+            physical_document AS p
+            LEFT JOIN
+                physical_document_language AS pdl
+                ON p.id = pdl.document_id
+            LEFT JOIN language AS l ON pdl.language_id = l.id
+        GROUP BY
+            p.id
+    ) AS language_agg ON d.physical_document_id = language_agg.id
+    LEFT JOIN ( -- event-only rows have no document and fail the WHERE filter
+        SELECT
+            family_event.family_import_id,
+            STRING_AGG(family_event.import_id, ';') AS event_import_ids,
+            STRING_AGG(family_event.title, ';') AS event_titles,
+            STRING_AGG(family_event.event_type_name, ';') AS event_type_names,
+            STRING_AGG(family_event.date::DATE::TEXT, ';') AS event_dates
+        FROM
+            family_event
+            INNER JOIN
+                family
+                ON family_event.family_import_id = family.import_id
+        GROUP BY
+            family_event.family_import_id
+    ) AS n3 ON f.import_id = n3.family_import_id
+    LEFT JOIN
+        most_recent_doc_slugs AS ds
+        ON d.import_id = ds.family_document_import_id
+    LEFT JOIN
+        most_recent_family_slugs AS fs
+        ON f.import_id = fs.family_import_id
+    LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id
+WHERE
+    d.last_modified < :ingest_cycle_start
+    AND fc.corpus_import_id = ANY(:allowed_corpora_ids)
+ORDER BY
+    d.last_modified DESC,
+    d.created DESC,
+    d.ctid DESC,
+    n1.family_import_id ASC
diff --git a/app/repository/sql/pipeline.sql b/app/repository/sql/pipeline.sql
index af6023e6..7a5d0e40 100644
--- a/app/repository/sql/pipeline.sql
+++ b/app/repository/sql/pipeline.sql
@@ -1,36 +1,43 @@
-WITH deduplicated_family_slugs AS (   SELECT
-        DISTINCT
-            ON (slug.family_import_id)   slug.family_import_id,
-            slug.created,
-            slug.name
+WITH deduplicated_family_slugs AS (
+    SELECT DISTINCT
+    ON (slug.family_import_id)
+        slug.family_import_id,
+        slug.created,
+        slug.name
     FROM
-        (     SELECT
-            slug.family_import_id AS "family_import_id",
-            Count(*) AS count
-        FROM
-            slug
-        WHERE
-            slug.family_import_id IS NOT NULL
-        GROUP BY
-            slug.family_import_id
-        HAVING
-            Count(*) > 1   ) duplicates
-    left join
+        (
+            SELECT
+                slug.family_import_id,
+                COUNT(*) AS count
+            FROM
+                slug
+            WHERE
+                slug.family_import_id IS NOT NULL
+            GROUP BY
+                slug.family_import_id
+            HAVING
+                COUNT(*) > 1
+        ) AS duplicates
+    LEFT JOIN
         slug
-            ON duplicates.family_import_id = slug.family_import_id
+        ON duplicates.family_import_id = slug.family_import_id
     ORDER BY
         slug.family_import_id DESC,
         slug.created DESC,
-        slug.ctid DESC ),
-        unique_family_slugs AS (   SELECT
-            DISTINCT
-                ON (slug.family_import_id)   slug.family_import_id,
-                slug.created,
-                slug.name
-        FROM
-            (     SELECT
-                slug.family_import_id AS "family_import_id",
-                Count(*) AS count
+        slug.ctid DESC
+),
+
+unique_family_slugs AS (
+    SELECT DISTINCT
+    ON (slug.family_import_id)
+        slug.family_import_id,
+        slug.created,
+        slug.name
+    FROM
+        (
+            SELECT
+                slug.family_import_id,
+                COUNT(*) AS count
             FROM
                 slug
             WHERE
@@ -38,219 +45,235 @@ WITH deduplicated_family_slugs AS (   SELECT
             GROUP BY
                 slug.family_import_id
             HAVING
-                Count(*) = 1   ) non_duplicates
-        left join
-            slug
-                ON non_duplicates.family_import_id = slug.family_import_id
-        ORDER BY
-            slug.family_import_id DESC,
-            slug.created DESC,
-            slug.ctid DESC   ),
-            most_recent_family_slugs AS (   SELECT
-                deduplicated_family_slugs.family_import_id AS "family_import_id",
-                deduplicated_family_slugs.created AS "created",
-                deduplicated_family_slugs.name AS "name"
-            FROM
-                deduplicated_family_slugs
-            UNION
-            ALL   SELECT
-                unique_family_slugs.family_import_id AS "family_import_id",
-                unique_family_slugs.created AS "created",
-                unique_family_slugs.name AS "name"
+                COUNT(*) = 1
+        ) AS non_duplicates
+    LEFT JOIN
+        slug
+        ON non_duplicates.family_import_id = slug.family_import_id
+    ORDER BY
+        slug.family_import_id DESC,
+        slug.created DESC,
+        slug.ctid DESC
+),
+
+most_recent_family_slugs AS (
+    SELECT
+        deduplicated_family_slugs.family_import_id,
+        deduplicated_family_slugs.created,
+        deduplicated_family_slugs.name
+    FROM
+        deduplicated_family_slugs
+    UNION ALL
+    SELECT
+        unique_family_slugs.family_import_id,
+        unique_family_slugs.created,
+        unique_family_slugs.name
+    FROM
+        unique_family_slugs
+    ORDER BY
+        family_import_id DESC,
+        created DESC
+),
+
+deduplicated_doc_slugs AS (
+    SELECT DISTINCT
+    ON (slug.family_document_import_id)
+        slug.family_document_import_id,
+        slug.created,
+        slug.name
+    FROM
+        (
+            SELECT
+                slug.family_document_import_id,
+                COUNT(*) AS count
             FROM
-                unique_family_slugs
-            ORDER BY
-                family_import_id DESC,
-                created DESC   ), deduplicated_doc_slugs AS (   SELECT
-                DISTINCT
-                    ON (slug.family_document_import_id)   slug.family_document_import_id,
-                    slug.created,
-                    slug.name
+                slug
+            WHERE
+                slug.family_document_import_id IS NOT NULL
+            GROUP BY
+                slug.family_document_import_id
+            HAVING
+                COUNT(*) > 1
+        ) AS duplicates
+    LEFT JOIN
+        slug
+        ON
+            duplicates.family_document_import_id
+            = slug.family_document_import_id
+    ORDER BY
+        slug.family_document_import_id DESC,
+        slug.created DESC,
+        slug.ctid DESC
+),
+
+unique_doc_slugs AS (
+    SELECT DISTINCT
+    ON (slug.family_document_import_id)
+        slug.family_document_import_id,
+        slug.created,
+        slug.name
+    FROM
+        (
+            SELECT
+                slug.family_document_import_id,
+                COUNT(*) AS count
             FROM
-                (     SELECT
-                    slug.family_document_import_id AS "family_document_import_id",
-                    Count(*) AS count
-                FROM
-                    slug
-                WHERE
-                    slug.family_document_import_id IS NOT NULL
-                GROUP BY
-                    slug.family_document_import_id
-                HAVING
-                    Count(*) >  1   ) duplicates
-            left join
                 slug
-                    ON duplicates.family_document_import_id = slug.family_document_import_id
-            ORDER BY
-                slug.family_document_import_id DESC,
-                slug.created DESC,
-                slug.ctid DESC ),
-                unique_doc_slugs AS (   SELECT
-                    DISTINCT
-                        ON (slug.family_document_import_id)   slug.family_document_import_id,
-                        slug.created,
-                        slug.name
-                FROM
-                    (     SELECT
-                        slug.family_document_import_id AS "family_document_import_id",
-                        Count(*) AS count
-                    FROM
-                        slug
-                    WHERE
-                        slug.family_document_import_id IS NOT NULL
-                    GROUP BY
-                        slug.family_document_import_id
-                    HAVING
-                        Count(*) = 1   ) non_duplicates
-                left join
-                    slug
-                        ON non_duplicates.family_document_import_id = slug.family_document_import_id
-                ORDER BY
-                    slug.family_document_import_id DESC,
-                    slug.created DESC,
-                    slug.ctid DESC   ),
-                    most_recent_doc_slugs AS (
-                        SELECT
-                            deduplicated_doc_slugs.family_document_import_id   AS "family_document_import_id",
-                            deduplicated_doc_slugs.created,
-                            deduplicated_doc_slugs.name
-                        FROM
-                            deduplicated_doc_slugs
-                        UNION
-                        ALL   SELECT
-                            unique_doc_slugs.family_document_import_id AS "family_document_import_id",
-                            unique_doc_slugs.created,
-                            unique_doc_slugs.name
-                        FROM
-                            unique_doc_slugs
-                        ORDER BY
-                            family_document_import_id DESC,
-                            created DESC
-                    ), event_dates AS (
-                        SELECT
-                            family_event.family_import_id AS family_import_id,
-                            CASE
-                                WHEN COUNT(*) FILTER (
-                                    WHERE family_event.event_type_name =
-                                    (family_event.valid_metadata->'datetime_event_name'->>0)
-                                ) > 0 THEN
-                                    MIN(CASE
-                                        WHEN family_event.event_type_name =
-                                        (family_event.valid_metadata->'datetime_event_name'->>0)
-                                        THEN family_event.date::TIMESTAMPTZ
-                                    END)
-                                ELSE
-                                    MIN(family_event.date::TIMESTAMPTZ)
-                            END AS published_date
-                        FROM
-                            family_event
-                        GROUP BY
-                            family_import_id
-                    )  SELECT
-                        f.title AS "family_title",
-                        p.title AS "physical_document_title",
-                        f.description AS "family_description",
-                        CASE
-                            WHEN f.family_category IN ('UNFCCC',
-                            'MCF') THEN Upper(f.family_category::text)
-                            ELSE Initcap(f.family_category::text)
-                        END "family_category",
-                        fp.published_date AS "family_published_date",
-                        d.import_id AS "family_document_import_id",
-                        ds.name AS "family_document_slug",
-                        f.import_id AS "family_import_id",
-                        fs.name AS "family_slug",
-                        p.source_url AS "physical_document_source_url",
-                        d.valid_metadata::json#>>'{type,0}' AS "family_document_type",
-                        o.name AS "organisation_name",
-                        geos.geographies AS "geographies",
-                        c.import_id AS "corpus_import_id",
-                        c.corpus_type_name AS "corpus_type_name",
-                        langs.languages AS "languages",
-                        fm.value AS "family_metadata",
-                        d.valid_metadata AS "family_document_metadata"
-                    FROM
-                        physical_document p
-                    join
-                        family_document d
-                            ON p.id = d.physical_document_id
-                    join
-                        family f
-                            ON d.family_import_id = f.import_id full
-                    join
-                        (
-                            SELECT
-                                family_geography.family_import_id AS "family_import_id",
-                                string_agg(geography.value,
-                                ';') AS geo_isos,
-                                string_agg(geography.display_value,
-                                ';') AS geo_display_values
-                            FROM
-                                geography
-                            inner join
-                                family_geography
-                                    ON geography.id = family_geography.geography_id
-                            GROUP BY
-                                family_geography.family_import_id
-                        ) fg
-                            ON fg.family_import_id=f.import_id
-                    join
-                        family_corpus fc
-                            ON f.import_id = fc.family_import_id
-                    join
-                        corpus c
-                            ON fc.corpus_import_id = c.import_id
-                    join
-                        organisation o
-                            ON c.organisation_id = o.id
-                    join
-                        family_metadata fm
-                            ON fm.family_import_id = f.import_id
-                    left outer join
-                        (
-                            SELECT
-                                family_document.import_id AS family_document_import_id,
-                                json_agg(DISTINCT(LANGUAGE.name)) AS languages
-                            FROM
-                                family_document
-                            join
-                                physical_document_language
-                                    ON physical_document_language.document_id = family_document.physical_document_id
-                            join
-                                LANGUAGE
-                                    ON LANGUAGE.id = physical_document_language.language_id
-                            GROUP BY
-                                family_document.import_id
-                        ) AS langs
-                            ON langs.family_document_import_id = d.import_id
-                    left outer join
-                        (
-                            SELECT
-                                family_geography.family_import_id AS family_import_id,
-                                json_agg(DISTINCT(geography.value)) AS geographies
-                            FROM
-                                family_geography
-                            join
-                                geography
-                                    ON geography.id = family_geography.geography_id
-                            GROUP BY
-                                family_geography.family_import_id
-                        ) AS geos
-                            ON geos.family_import_id = f.import_id
-                    left join
-                        most_recent_doc_slugs ds
-                            ON ds.family_document_import_id = d.import_id
-                    left join
-                        most_recent_family_slugs fs
-                            ON fs.family_import_id = f.import_id
-                    left join
-                        event_dates fp
-                            ON fp.family_import_id = f.import_id
+            WHERE
+                slug.family_document_import_id IS NOT NULL
+            GROUP BY
+                slug.family_document_import_id
+            HAVING
+                COUNT(*) = 1
+        ) AS non_duplicates
+    LEFT JOIN
+        slug
+        ON
+            non_duplicates.family_document_import_id
+            = slug.family_document_import_id
+    ORDER BY
+        slug.family_document_import_id DESC,
+        slug.created DESC,
+        slug.ctid DESC
+),
+
+most_recent_doc_slugs AS (
+    SELECT
+        deduplicated_doc_slugs.family_document_import_id,
+        deduplicated_doc_slugs.created,
+        deduplicated_doc_slugs.name
+    FROM
+        deduplicated_doc_slugs
+    UNION ALL
+    SELECT
+        unique_doc_slugs.family_document_import_id,
+        unique_doc_slugs.created,
+        unique_doc_slugs.name
+    FROM
+        unique_doc_slugs
+    ORDER BY
+        family_document_import_id DESC,
+        created DESC
+),
+
+event_dates AS (
+    SELECT
+        family_event.family_import_id,
+        CASE
+            WHEN
+                COUNT(*) FILTER (
                     WHERE
-                        d.document_status != 'DELETED'
-                        AND fg.family_import_id = f.import_id
-                    ORDER BY
-                        d.last_modified DESC,
-                        d.created DESC,
-                        d.ctid DESC,
-                        f.import_id
+                    family_event.event_type_name = (
+                        family_event.valid_metadata
+                        -> 'datetime_event_name'
+                        ->> 0
+                    )
+                ) > 0
+                THEN MIN(
+                    CASE
+                        WHEN family_event.event_type_name = (
+                            family_event.valid_metadata
+                            -> 'datetime_event_name'
+                            ->> 0
+                        ) THEN family_event.date::TIMESTAMPTZ
+                    END
+                )
+            ELSE MIN(family_event.date::TIMESTAMPTZ)
+        END AS published_date
+    FROM
+        family_event
+    GROUP BY
+        family_event.family_import_id
+),
+
+fg AS (
+    SELECT
+        family_geography.family_import_id,
+        STRING_AGG(geography.value, ';') AS geo_isos,
+        STRING_AGG(geography.display_value, ';') AS geo_display_values
+    FROM
+        geography
+    INNER JOIN
+        family_geography
+        ON geography.id = family_geography.geography_id
+    GROUP BY
+        family_geography.family_import_id
+),
+
+geos AS (
+    SELECT
+        family_geography.family_import_id,
+        JSON_AGG(DISTINCT geography.value) AS geographies
+    FROM
+        family_geography
+    INNER JOIN geography ON family_geography.geography_id = geography.id
+    GROUP BY
+        family_geography.family_import_id
+)
+
+SELECT
+    f.title AS family_title,
+    p.title AS physical_document_title,
+    f.description AS family_description,
+    fp.published_date AS family_published_date,
+    d.import_id AS family_document_import_id,
+    ds.name AS family_document_slug,
+    f.import_id AS family_import_id,
+    fs.name AS family_slug,
+    p.source_url AS physical_document_source_url,
+    o.name AS organisation_name,
+    geos.geographies,
+    c.import_id AS corpus_import_id,
+    c.corpus_type_name,
+    langs.languages,
+    fm.value AS family_metadata,
+    d.valid_metadata AS family_document_metadata,
+    CASE
+        WHEN
+            f.family_category IN ('UNFCCC', 'MCF')
+            THEN UPPER(f.family_category::TEXT)
+        ELSE INITCAP(f.family_category::TEXT)
+    END AS family_category,
+    d.valid_metadata::JSON #>> '{type,0}' AS family_document_type
+FROM
+    physical_document AS p
+INNER JOIN family_document AS d ON p.id = d.physical_document_id
+INNER JOIN family AS f ON d.family_import_id = f.import_id
+INNER JOIN fg ON f.import_id = fg.family_import_id -- WHERE needs a fg match
+INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id
+INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id
+INNER JOIN organisation AS o ON c.organisation_id = o.id
+INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id
+LEFT OUTER JOIN (
+    SELECT
+        family_document.import_id AS family_document_import_id,
+        JSON_AGG(DISTINCT language.name) AS languages
+    FROM
+        family_document
+    INNER JOIN
+        physical_document_language
+        ON
+            family_document.physical_document_id
+            = physical_document_language.document_id
+    INNER JOIN
+        language
+        ON physical_document_language.language_id = language.id
+    GROUP BY
+        family_document.import_id
+) AS langs ON d.import_id = langs.family_document_import_id
+LEFT OUTER JOIN geos ON f.import_id = geos.family_import_id
+LEFT JOIN
+    most_recent_doc_slugs AS ds
+    ON d.import_id = ds.family_document_import_id
+LEFT JOIN
+    most_recent_family_slugs AS fs
+    ON f.import_id = fs.family_import_id
+LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id
+WHERE
+    d.document_status != 'DELETED'
+    AND fg.family_import_id = f.import_id
+ORDER BY
+    d.last_modified DESC,
+    d.created DESC,
+    d.ctid DESC,
+    f.import_id ASC
diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql
index 9d649067..09cb2e69 100644
--- a/app/repository/sql/slug_lookup.sql
+++ b/app/repository/sql/slug_lookup.sql
@@ -1,20 +1,33 @@
-SELECT
-    slug.family_document_import_id, slug.family_import_id
+-- First query for family document slugs
+SELECT DISTINCT
+    slug.family_document_import_id,
+    slug.family_import_id
 FROM slug
-LEFT JOIN family ON family.import_id = slug.family_import_id
-LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id
-LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id
-WHERE slug.name = '{slug_name}'
-AND corpus.import_id IN ({allowed_corpora_ids})
+    INNER JOIN family_document
+        ON slug.family_document_import_id = family_document.import_id
+    INNER JOIN family
+        ON family_document.family_import_id = family.import_id
+    INNER JOIN family_corpus
+        ON family.import_id = family_corpus.family_import_id
+    INNER JOIN corpus
+        ON family_corpus.corpus_import_id = corpus.import_id
+WHERE
+    slug.name = :slug_name
+    AND corpus.import_id = ANY(:allowed_corpora_ids)
 
 UNION
 
-SELECT
-    slug.family_document_import_id, slug.family_import_id
+-- Second query for family slugs
+SELECT DISTINCT
+    NULL AS family_document_import_id,
+    slug.family_import_id
 FROM slug
-LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id
-LEFT JOIN family ON family.import_id = family_document.family_import_id
-LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id
-LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id
-WHERE slug.name = '{slug_name}'
-AND corpus.import_id IN ({allowed_corpora_ids});
+    INNER JOIN family
+        ON slug.family_import_id = family.import_id
+    INNER JOIN family_corpus
+        ON family.import_id = family_corpus.family_import_id
+    INNER JOIN corpus
+        ON family_corpus.corpus_import_id = corpus.import_id
+WHERE
+    slug.name = :slug_name
+    AND corpus.import_id = ANY(:allowed_corpora_ids)
diff --git a/makefile-docker.defs b/makefile-docker.defs
index 276a67f0..b41a9358 100644
--- a/makefile-docker.defs
+++ b/makefile-docker.defs
@@ -123,7 +123,7 @@ test_non_search:
 	docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS}
 
 test:
-	docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv ${ARGS}
+	docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests ${ARGS}
 
 # ----------------------------------
 # tasks
diff --git a/pyproject.toml b/pyproject.toml
index e5569fe6..849d3b4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "navigator_backend"
-version = "1.19.14"
+version = "1.19.15"
 description = ""
 authors = ["CPR-dev-team <tech@climatepolicyradar.org>"]
 packages = [{ include = "app" }, { include = "tests" }]