Add sql linting using SQLFluff (#417)

* Enable sql-formatter * Fix formatting * Replace with variables & bind * Update .git-blame-ignore-revs * Fix formatting * Remove sqlformatter & enable sqlfluff * Fix formatting * Fix formatting * Use sqlfluff lint and fix * Bump pyproject * Fix Download query placeholders * Add sql linting * Fixed slug_lookup query * Fix query param binding * Update .sqlfluff * Rename keyword identifier to non keyword identifier * Revert makefile changes * Fix linting errors * Update download query logic based on linting * Update formatting * Remove debug function * Revert SQLalchemy 2
climatepolicyradar · Nov 28, 2024 · 9089898 · 9089898
1 parent b1ec83b
commit 9089898
Show file tree

Hide file tree

Showing 11 changed files with 774 additions and 553 deletions.
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
@@ -23,3 +23,7 @@
 
 # Updating the test data file for document passages to be indent=2
 44624dcd1fa0835708bd9187a39bb0da8a31cd03
+
+# Fix SQL query formatting
+047766a85f086fc0986a6f2b49fee9d73fa219e8
+ab3476708920c5760f058ec40d14d008f94f5bad
diff --git a/.trunk/configs/.sqlfluff b/.trunk/configs/.sqlfluff
@@ -0,0 +1,30 @@
+[sqlfluff]
+dialect = postgres
+exclude_rules = LT02, LT09
+
+[sqlfluff:indentation]
+indented_ctes = True
+
+[sqlfluff:layout:type:colon]
+spacing_before = single
+spacing_after = single
+
+[sqlfluff:layout:type:parameter]
+spacing_before = touch
+spacing_after = any
+
+[sqlfluff:rules:references.special_chars]
+allow_space_in_identifier = True
+additional_allowed_characters = ["/", "_", "-", "(", ")"]
+
+[sqlfluff:rules:capitalisation.keywords]
+capitalisation_policy = upper
+
+[sqlfluff:rules:capitalisation.identifiers]
+extended_capitalisation_policy = lower
+
+[sqlfluff:rules:capitalisation.functions]
+extended_capitalisation_policy = upper
+
+[sqlfluff:rules:capitalisation.types]
+extended_capitalisation_policy = upper
diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml
@@ -5,6 +5,14 @@ version: 0.1
 cli:
   version: 1.22.0
 
+tools:
+  definitions:
+    - name: sqlfluff
+      runtime: python
+      package: sqlfluff
+      shims: [sqlfluff]
+      known_good_version: 1.4.5
+
 # Trunk provides extensibility via plugins.
 # (https://docs.trunk.io/plugins)
 plugins:
@@ -27,13 +35,53 @@ lint:
   disabled:
     - hadolint
     - oxipng
+
   definitions:
     - name: bandit
       direct_configs: [bandit.yaml]
       commands:
         - name: lint
           run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target}
 
+    - name: sqlfluff
+      files: [sql, sql-j2, dml, ddl]
+      tools: [sqlfluff]
+      description: A dialect-flexible and configurable SQL linter
+      known_good_version: 1.4.5
+      direct_configs:
+        - .sqlfluff
+      affects_cache:
+        - pyproject.toml
+      suggest_if: config_present
+      commands:
+        - name: lint
+          run: sqlfluff lint ${target} --format json --nofail
+          output: sarif
+          success_codes: [0]
+          read_output_from: stdout
+          parser:
+            runtime: python
+            run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py
+
+        - name: fix
+          version: ">=3.0.0"
+          run: sqlfluff fix ${target} --disable-progress-bar
+          output: rewrite
+          formatter: true
+          in_place: true
+          success_codes: [0, 1]
+          enabled: false
+          batch: true
+
+        - name: format
+          run: sqlfluff format ${target} --disable-progress-bar
+          output: rewrite
+          formatter: true
+          in_place: true
+          success_codes: [0, 1]
+          enabled: false
+          batch: true
+
   ignore:
     - linters: [ALL]
       paths:
@@ -45,6 +93,8 @@ lint:
         - LICENSE.md
 
   enabled:
+    - [email protected]:
+        commands: [lint, fix, format]
     - [email protected]
     - [email protected]
     - [email protected]

diff --git a/app/repository/document.py b/app/repository/document.py
@@ -1,8 +1,4 @@
-"""
-Functions to support the documents endpoints
-
-old functions (non DFC) are moved to the deprecated_documents.py file.
-"""
+"""Database helper functions for the documents entity."""
 
 import logging
 import os
@@ -22,8 +18,9 @@
 from db_client.models.dfce.metadata import FamilyMetadata
 from db_client.models.document.physical_document import PhysicalDocument
 from db_client.models.organisation.organisation import Organisation
-from sqlalchemy import func
+from sqlalchemy import bindparam, func, text
 from sqlalchemy.orm import Session
+from sqlalchemy.types import ARRAY, String
 
 from app.models.document import (
     CollectionOverviewResponse,
@@ -42,22 +39,6 @@
 _LOGGER = logging.getLogger(__file__)
 
 
-def get_slugged_object_from_allowed_corpora_query(
-    template_query, slug_name: str, allowed_corpora_ids: list[str]
-) -> str:
-    """Create download whole database query, replacing variables.
-
-    :param str ingest_cycle_start: The current ingest cycle date.
-    :param list[str] allowed_corpora_ids: The corpora from which we
-        should allow the data to be dumped.
-    :return str: The SQL query to perform on the database session.
-    """
-    corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
-    return template_query.replace("{slug_name}", slug_name).replace(  # type: ignore
-        "{allowed_corpora_ids}", corpora_ids
-    )  # type: ignore
-
-
 def get_slugged_objects(
     db: Session, slug: str, allowed_corpora: Optional[list[str]] = None
 ) -> tuple[Optional[str], Optional[str]]:
@@ -74,14 +55,22 @@ def get_slugged_objects(
     :return tuple[Optional[str], Optional[str]]: the FamilyDocument
         import id or the Family import_id.
     """
-    if allowed_corpora is not None:
-        query_template = get_query_template(
-            os.path.join("app", "repository", "sql", "slug_lookup.sql")
+    if allowed_corpora not in [None, []]:
+        query_template = text(
+            get_query_template(
+                os.path.join("app", "repository", "sql", "slug_lookup.sql")
+            )
+        )
+
+        query_template = query_template.bindparams(
+            bindparam("slug_name", type_=String),
+            bindparam(
+                "allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String)
+            ),
         )
-        query = get_slugged_object_from_allowed_corpora_query(
-            query_template, slug, allowed_corpora
+        query = db.execute(
+            query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora}
         )
-        query = db.execute(query)
     else:
         query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter(
             Slug.name == slug

diff --git a/app/repository/download.py b/app/repository/download.py
@@ -5,39 +5,43 @@
 
 import pandas as pd
 from fastapi import Depends
+from sqlalchemy import bindparam, text
+from sqlalchemy.types import ARRAY, DATETIME, String
 
 from app.clients.db.session import get_db
 from app.repository.helpers import get_query_template
 
 _LOGGER = getLogger(__name__)
 
 
-def create_query(
-    template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str]
-) -> str:
-    """Create download whole database query, replacing variables.
+def get_whole_database_dump(
+    ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db)
+):
+    """Get whole database dump and bind variables.
 
     :param str ingest_cycle_start: The current ingest cycle date.
-    :param list[str] allowed_corpora_ids: The corpora from which we
+    :param list[str] corpora_ids: The corpora from which we
         should allow the data to be dumped.
-    :return str: The SQL query to perform on the database session.
+    :return pd.DataFrame: A DataFrame containing the results of the SQL
+        query that gets the whole database dump in our desired format.
     """
-    corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
-    return template_query.replace(  # type: ignore
-        "{ingest_cycle_start}", ingest_cycle_start
-    ).replace(
-        "{allowed_corpora_ids}", corpora_ids
-    )  # type: ignore
-
-
-def get_whole_database_dump(
-    ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db)
-):
-    query_template = get_query_template(
-        os.path.join("app", "repository", "sql", "download.sql")
+    query = text(
+        get_query_template(os.path.join("app", "repository", "sql", "download.sql"))
+    ).bindparams(
+        bindparam("ingest_cycle_start", type_=DATETIME),
+        bindparam(
+            "allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String)
+        ),
     )
-    query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids)
 
     with db.connection() as conn:
-        df = pd.read_sql(query, conn.connection)
+        result = conn.execute(
+            query,
+            {
+                "ingest_cycle_start": ingest_cycle_start,
+                "allowed_corpora_ids": allowed_corpora_ids,
+            },
+        )
+        columns = result.keys()
+        df = pd.DataFrame(result.fetchall(), columns=columns)
         return df
diff --git a/app/repository/helpers.py b/app/repository/helpers.py
@@ -1,8 +1,4 @@
-"""
-Functions to support the documents endpoints
-
-old functions (non DFC) are moved to the deprecated_documents.py file.
-"""
+"""Helper functions for the repository layer."""
 
 from functools import lru_cache