Skip to content

Commit

Permalink
Add sql linting using SQLFluff (#417)
Browse files Browse the repository at this point in the history
* Enable sql-formatter

* Fix formatting

* Replace with variables & bind

* Update .git-blame-ignore-revs

* Fix formatting

* Remove sqlformatter & enable sqlfluff

* Fix formatting

* Fix formatting

* Use sqlfluff lint and fix

* Bump pyproject

* Fix Download query placeholders

* Add sql linting

* Fixed slug_lookup query

* Fix query param binding

* Update .sqlfluff

* Rename keyword identifier to non keyword identifier

* Revert makefile changes

* Fix linting errors

* Update download query logic based on linting

* Update formatting

* Remove debug function

* Revert SQLalchemy 2
  • Loading branch information
katybaulch authored Nov 28, 2024
1 parent b1ec83b commit 9089898
Show file tree
Hide file tree
Showing 11 changed files with 774 additions and 553 deletions.
4 changes: 4 additions & 0 deletions .git-blame-ignore-revs
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,7 @@

# Updating the test data file for document passages to be indent=2
44624dcd1fa0835708bd9187a39bb0da8a31cd03

# Fix SQL query formatting
047766a85f086fc0986a6f2b49fee9d73fa219e8
ab3476708920c5760f058ec40d14d008f94f5bad
30 changes: 30 additions & 0 deletions .trunk/configs/.sqlfluff
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[sqlfluff]
dialect = postgres
exclude_rules = LT02, LT09

[sqlfluff:indentation]
indented_ctes = True

[sqlfluff:layout:type:colon]
spacing_before = single
spacing_after = single

[sqlfluff:layout:type:parameter]
spacing_before = touch
spacing_after = any

[sqlfluff:rules:references.special_chars]
allow_space_in_identifier = True
additional_allowed_characters = ["/", "_", "-", "(", ")"]

[sqlfluff:rules:capitalisation.keywords]
capitalisation_policy = upper

[sqlfluff:rules:capitalisation.identifiers]
extended_capitalisation_policy = lower

[sqlfluff:rules:capitalisation.functions]
extended_capitalisation_policy = upper

[sqlfluff:rules:capitalisation.types]
extended_capitalisation_policy = upper
50 changes: 50 additions & 0 deletions .trunk/trunk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ version: 0.1
cli:
version: 1.22.0

tools:
definitions:
- name: sqlfluff
runtime: python
package: sqlfluff
shims: [sqlfluff]
known_good_version: 1.4.5

# Trunk provides extensibility via plugins.
# (https://docs.trunk.io/plugins)
plugins:
Expand All @@ -27,13 +35,53 @@ lint:
disabled:
- hadolint
- oxipng

definitions:
- name: bandit
direct_configs: [bandit.yaml]
commands:
- name: lint
run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target}

- name: sqlfluff
files: [sql, sql-j2, dml, ddl]
tools: [sqlfluff]
description: A dialect-flexible and configurable SQL linter
known_good_version: 1.4.5
direct_configs:
- .sqlfluff
affects_cache:
- pyproject.toml
suggest_if: config_present
commands:
- name: lint
run: sqlfluff lint ${target} --format json --nofail
output: sarif
success_codes: [0]
read_output_from: stdout
parser:
runtime: python
run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py

- name: fix
version: ">=3.0.0"
run: sqlfluff fix ${target} --disable-progress-bar
output: rewrite
formatter: true
in_place: true
success_codes: [0, 1]
enabled: false
batch: true

- name: format
run: sqlfluff format ${target} --disable-progress-bar
output: rewrite
formatter: true
in_place: true
success_codes: [0, 1]
enabled: false
batch: true

ignore:
- linters: [ALL]
paths:
Expand All @@ -45,6 +93,8 @@ lint:
- LICENSE.md

enabled:
- [email protected]:
commands: [lint, fix, format]
- [email protected]
- [email protected]
- [email protected]
Expand Down
45 changes: 17 additions & 28 deletions app/repository/document.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
"""
Functions to support the documents endpoints
old functions (non DFC) are moved to the deprecated_documents.py file.
"""
"""Database helper functions for the documents entity."""

import logging
import os
Expand All @@ -22,8 +18,9 @@
from db_client.models.dfce.metadata import FamilyMetadata
from db_client.models.document.physical_document import PhysicalDocument
from db_client.models.organisation.organisation import Organisation
from sqlalchemy import func
from sqlalchemy import bindparam, func, text
from sqlalchemy.orm import Session
from sqlalchemy.types import ARRAY, String

from app.models.document import (
CollectionOverviewResponse,
Expand All @@ -42,22 +39,6 @@
_LOGGER = logging.getLogger(__file__)


def get_slugged_object_from_allowed_corpora_query(
template_query, slug_name: str, allowed_corpora_ids: list[str]
) -> str:
"""Create download whole database query, replacing variables.
:param str ingest_cycle_start: The current ingest cycle date.
:param list[str] allowed_corpora_ids: The corpora from which we
should allow the data to be dumped.
:return str: The SQL query to perform on the database session.
"""
corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
return template_query.replace("{slug_name}", slug_name).replace( # type: ignore
"{allowed_corpora_ids}", corpora_ids
) # type: ignore


def get_slugged_objects(
db: Session, slug: str, allowed_corpora: Optional[list[str]] = None
) -> tuple[Optional[str], Optional[str]]:
Expand All @@ -74,14 +55,22 @@ def get_slugged_objects(
:return tuple[Optional[str], Optional[str]]: the FamilyDocument
import id or the Family import_id.
"""
if allowed_corpora is not None:
query_template = get_query_template(
os.path.join("app", "repository", "sql", "slug_lookup.sql")
if allowed_corpora not in [None, []]:
query_template = text(
get_query_template(
os.path.join("app", "repository", "sql", "slug_lookup.sql")
)
)

query_template = query_template.bindparams(
bindparam("slug_name", type_=String),
bindparam(
"allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String)
),
)
query = get_slugged_object_from_allowed_corpora_query(
query_template, slug, allowed_corpora
query = db.execute(
query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora}
)
query = db.execute(query)
else:
query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter(
Slug.name == slug
Expand Down
46 changes: 25 additions & 21 deletions app/repository/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,43 @@

import pandas as pd
from fastapi import Depends
from sqlalchemy import bindparam, text
from sqlalchemy.types import ARRAY, DATETIME, String

from app.clients.db.session import get_db
from app.repository.helpers import get_query_template

_LOGGER = getLogger(__name__)


def create_query(
template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str]
) -> str:
"""Create download whole database query, replacing variables.
def get_whole_database_dump(
ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db)
):
"""Get whole database dump and bind variables.
:param str ingest_cycle_start: The current ingest cycle date.
:param list[str] allowed_corpora_ids: The corpora from which we
:param list[str] corpora_ids: The corpora from which we
should allow the data to be dumped.
:return str: The SQL query to perform on the database session.
:return pd.DataFrame: A DataFrame containing the results of the SQL
query that gets the whole database dump in our desired format.
"""
corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'"
return template_query.replace( # type: ignore
"{ingest_cycle_start}", ingest_cycle_start
).replace(
"{allowed_corpora_ids}", corpora_ids
) # type: ignore


def get_whole_database_dump(
ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db)
):
query_template = get_query_template(
os.path.join("app", "repository", "sql", "download.sql")
query = text(
get_query_template(os.path.join("app", "repository", "sql", "download.sql"))
).bindparams(
bindparam("ingest_cycle_start", type_=DATETIME),
bindparam(
"allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String)
),
)
query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids)

with db.connection() as conn:
df = pd.read_sql(query, conn.connection)
result = conn.execute(
query,
{
"ingest_cycle_start": ingest_cycle_start,
"allowed_corpora_ids": allowed_corpora_ids,
},
)
columns = result.keys()
df = pd.DataFrame(result.fetchall(), columns=columns)
return df
6 changes: 1 addition & 5 deletions app/repository/helpers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
"""
Functions to support the documents endpoints
old functions (non DFC) are moved to the deprecated_documents.py file.
"""
"""Helper functions for the repository layer."""

from functools import lru_cache

Expand Down
Loading

0 comments on commit 9089898

Please sign in to comment.