diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4e716ce..b63cc81 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+### Added
+
+- `dp lint` command that uses [SQLFluff](https://github.com/sqlfluff/sqlfluff) to lint models and tests
+
 ## [0.19.0] - 2022-04-25
 
 ### Added
diff --git a/data_pipelines_cli/cli.py b/data_pipelines_cli/cli.py
index c6845e9..fc52a0c 100644
--- a/data_pipelines_cli/cli.py
+++ b/data_pipelines_cli/cli.py
@@ -9,6 +9,7 @@
 from .cli_commands.docs import docs_command
 from .cli_commands.generate.generate import generate_group
 from .cli_commands.init import init_command
+from .cli_commands.lint import lint_command
 from .cli_commands.prepare_env import prepare_env_command
 from .cli_commands.publish import publish_command
 from .cli_commands.run import run_command
@@ -43,6 +44,7 @@ def cli() -> None:
 _cli.add_command(docs_command)
 _cli.add_command(generate_group)
 _cli.add_command(init_command)
+_cli.add_command(lint_command)
 _cli.add_command(list_templates_command)
 _cli.add_command(prepare_env_command)
 _cli.add_command(publish_command)
diff --git a/data_pipelines_cli/cli_commands/lint.py b/data_pipelines_cli/cli_commands/lint.py
new file mode 100644
index 0000000..f85e667
--- /dev/null
+++ b/data_pipelines_cli/cli_commands/lint.py
@@ -0,0 +1,160 @@
+import pathlib
+import tempfile
+from configparser import ConfigParser
+from typing import Any, List
+
+import click
+import yaml
+
+from ..cli_constants import BUILD_DIR
+from ..cli_utils import echo_info, echo_subinfo, echo_warning, subprocess_run
+from ..config_generation import (
+    generate_profiles_yml,
+    read_dictionary_from_config_directory,
+)
+from ..dbt_utils import read_dbt_vars_from_configs
+from ..errors import SQLLintError, SubprocessNonZeroExitError
+
+SQLFLUFF_FIX_NOT_EVERYTHING_ERROR = 1
+SQLFLUFF_LINT_ERROR = 65  # according to `sqlfluff.core.linter.LintingResult.stats`
+SQLFLUFF_DIALECT_LOADING_ERROR = 66  # according to `sqlfluff.cli.commands.get_config`
+
+
+def _get_dialect_or_default() -> str:
+    """Read the ``dbt.yml`` config file and return its ``target_type``, defaulting to ``ansi``."""
+    env, dbt_filename = "base", "dbt.yml"
+    dbt_env_config = read_dictionary_from_config_directory(
+        BUILD_DIR.joinpath("dag"), env, dbt_filename
+    ) or read_dictionary_from_config_directory(pathlib.Path.cwd(), env, dbt_filename)
+    try:
+        dialect = dbt_env_config["target_type"]
+        echo_subinfo(f'Found target_type "{dialect}", attempting to use it as the SQL dialect.')
+    except KeyError:
+        dialect = "ansi"
+        echo_warning(
+            'Could not find `target_type` in `dbt.yml`. Using the default SQL dialect ("ansi").'
+        )
+    return dialect
+
+
+def _get_source_tests_paths() -> List[pathlib.Path]:
+    with open(pathlib.Path.cwd().joinpath("dbt_project.yml"), "r") as f:
+        dbt_project_config = yaml.safe_load(f)
+    dir_names: List[str] = (
+        dbt_project_config.get("source-paths", [])
+        + dbt_project_config.get("model-paths", [])
+        + dbt_project_config.get("test-paths", [])
+    )
+    return list(map(lambda dir_name: pathlib.Path.cwd().joinpath(dir_name), dir_names))
+
+
+def _insert_into_config_section(config: ConfigParser, section: str, key: str, value: Any) -> None:
+    if section not in config:
+        config[section] = {}
+    config[section][key] = value
+
+
+def _create_temporary_sqlfluff_config(env: str) -> ConfigParser:
+    sqlfluff_config_path = pathlib.Path.cwd().joinpath(".sqlfluff")
+    config = ConfigParser()
+    if sqlfluff_config_path.exists():
+        config.read(sqlfluff_config_path)
+
+    _insert_into_config_section(config, "sqlfluff", "templater", "dbt")
+    _insert_into_config_section(
+        config,
+        "sqlfluff:templater:dbt",
+        "profiles_dir",
+        str(generate_profiles_yml(env, copy_config_dir=True).absolute()),
+    )
+    config["sqlfluff:templater:dbt:context"] = read_dbt_vars_from_configs(env)
+
+    return config
+
+
+def _run_sqlfluff(command: str, dialect: str, env: str, additional_args: List[str]) -> None:
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_config_path = pathlib.Path(tmp_dir).joinpath("sqlfluff.config")
+        with open(tmp_config_path, "w") as tmp_config:
+            _create_temporary_sqlfluff_config(env).write(tmp_config)
+
+        def sqlfluff_args(sql_dialect: str) -> List[str]:
+            return [
+                "sqlfluff",
+                command,
+                "--dialect",
+                sql_dialect,
+                "--config",
+                str(tmp_config_path),
+                *additional_args,
+                *map(str, _get_source_tests_paths()),
+            ]
+
+        try:
+            subprocess_run(sqlfluff_args(dialect))
+        except SubprocessNonZeroExitError as err:
+            if err.exit_code == SQLFLUFF_DIALECT_LOADING_ERROR and dialect != "ansi":
+                subprocess_run(sqlfluff_args("ansi"))
+            else:
+                raise err
+
+
+def _run_fix_sqlfluff(dialect: str, env: str) -> None:
+    try:
+        echo_subinfo("Attempting to fix SQLs. Not every error can be automatically fixed.")
+        _run_sqlfluff("fix", dialect, env, ["--force"])
+    except SubprocessNonZeroExitError as err:
+        if err.exit_code != SQLFLUFF_FIX_NOT_EVERYTHING_ERROR:
+            raise err
+
+
+def _run_lint_sqlfluff(dialect: str, env: str) -> None:
+    try:
+        echo_subinfo("Linting SQLs.")
+        _run_sqlfluff("lint", dialect, env, [])
+    except SubprocessNonZeroExitError as err:
+        if err.exit_code == SQLFLUFF_LINT_ERROR:
+            raise SQLLintError
+        else:
+            raise err
+
+
+def lint(fix: bool, env: str) -> None:
+    """
+    Lint and format SQL.
+
+    :param fix: Whether to lint and fix linting errors, or just lint.
+    :type fix: bool
+    :param env: Name of the environment
+    :type env: str
+    """
+    echo_info("Linting SQLs:")
+    dialect = _get_dialect_or_default()
+    if fix:
+        _run_fix_sqlfluff(dialect, env)
+    _run_lint_sqlfluff(dialect, env)
+
+
+@click.command(
+    name="lint",
+    short_help="Lint and format SQL",
+    help="Lint and format SQL using SQLFluff.\n\n"
+    "For more information on rules and the workings of SQLFluff, "
+    "refer to https://docs.sqlfluff.com/",
+)
+@click.option(
+    "--no-fix",
+    is_flag=True,
+    default=False,
+    type=bool,
+    help="Only lint, without trying to fix the linting errors found.",
+)
+@click.option(
+    "--env",
+    default="local",
+    type=str,
+    show_default=True,
+    help="Name of the environment",
+)
+def lint_command(no_fix: bool, env: str) -> None:
+    lint(not no_fix, env)
diff --git a/data_pipelines_cli/errors.py b/data_pipelines_cli/errors.py
index 0e4ef39..f16a77c 100644
--- a/data_pipelines_cli/errors.py
+++ b/data_pipelines_cli/errors.py
@@ -44,6 +44,8 @@ def __init__(self, project_path: str) -> None:
 class SubprocessNonZeroExitError(DataPipelinesError):
     """Exception raised if subprocess exits with non-zero exit code"""
 
+    exit_code: int
+
     def __init__(
         self, subprocess_name: str, exit_code: int, subprocess_output: Optional[str] = None
     ) -> None:
@@ -51,6 +53,7 @@ def __init__(
             f"{subprocess_name} has exited with non-zero exit code: {exit_code}",
             submessage=subprocess_output,
         )
+        self.exit_code = exit_code
 
 
 class SubprocessNotFound(DataPipelinesError):
@@ -91,3 +94,10 @@ class DockerErrorResponseError(DataPipelinesError):
 
     def __init__(self, error_msg: str) -> None:
         super().__init__("Error raised when using Docker.\n" + error_msg)
+
+
+class SQLLintError(DataPipelinesError):
+    """Exception raised if there are linting problems in some files."""
+
+    def __init__(self) -> None:
+        super().__init__("Fix SQL linting errors.")
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 6ecdde1..763f946 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -3,3 +3,5 @@ sphinx-click>=4.0,<4.1
 myst-parser>=0.17, <0.18
 docutils>=0.17,<0.18
 GitPython==3.1.26
+git+https://github.com/swtwsk/copier@regex-update
+
diff --git a/docs/source/data_pipelines_cli.cli_commands.rst b/docs/source/data_pipelines_cli.cli_commands.rst
index a86ebfe..6faebfa 100644
--- a/docs/source/data_pipelines_cli.cli_commands.rst
+++ b/docs/source/data_pipelines_cli.cli_commands.rst
@@ -64,6 +64,14 @@ data\_pipelines\_cli.cli\_commands.init module
    :undoc-members:
    :show-inheritance:
 
+data\_pipelines\_cli.cli\_commands.lint module
+----------------------------------------------
+
+.. automodule:: data_pipelines_cli.cli_commands.lint
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 data\_pipelines\_cli.cli\_commands.prepare\_env module
 ------------------------------------------------------
 
@@ -119,4 +127,3 @@ data\_pipelines\_cli.cli\_commands.update module
    :members:
    :undoc-members:
    :show-inheritance:
-
diff --git a/setup.py b/setup.py
index d7b0b2f..097a7ae 100644
--- a/setup.py
+++ b/setup.py
@@ -12,10 +12,11 @@
     "questionary==1.10.0",
     "pyyaml>=5.1, <6.0",
     "types-PyYAML>=6.0",
-    "copier==5.1.0",
+    # due to the 'regex' conflict between copier and sqlfluff:
+    "copier @ git+https://github.com/swtwsk/copier@regex-update",
     "Jinja2>=2.11,<2.12",
     "fsspec",
-    "packaging>=20.4,<21.0",
+    "packaging>=20.9,<21.0",
 ]
 
 EXTRA_FILESYSTEMS_REQUIRE = {
@@ -33,6 +34,7 @@
     "docker": ["docker>=5.0"],
     "datahub": ["acryl-datahub>=0.8.17, <0.8.18"],
     "git": ["GitPython==3.1.26"],
+    "lint": ["sqlfluff>=0.12.0,<1.0", "sqlfluff-templater-dbt>=0.12.0,<1.0"],
     "tests": [
         "pytest>=6.2.2, <7.0.0",
         "pytest-cov>=2.8.0, <3.0.0",
diff --git a/tests/cli_commands/test_lint.py b/tests/cli_commands/test_lint.py
new file mode 100644
index 0000000..f723535
--- /dev/null
+++ b/tests/cli_commands/test_lint.py
@@ -0,0 +1,166 @@
+import pathlib
+import shutil
+import tempfile
+import unittest
+from typing import List
+from unittest.mock import MagicMock, patch
+
+import yaml
+from click.testing import CliRunner
+
+from data_pipelines_cli.cli import _cli
+from data_pipelines_cli.cli_commands.lint import (
+    _get_dialect_or_default,
+    _get_source_tests_paths,
+)
+from data_pipelines_cli.errors import SQLLintError, SubprocessNonZeroExitError
+
+goldens_dir_path = pathlib.Path(__file__).parent.parent.joinpath("goldens")
+
+
+@patch("data_pipelines_cli.cli_commands.lint.generate_profiles_yml", MagicMock())
+class LintCommandTestCase(unittest.TestCase):
+    def setUp(self) -> None:
+        self.linted_sqls = []
+        self.subprocess_run_args = []
+
+        self.dbt_project_tmp_dir = pathlib.Path(tempfile.mkdtemp())
+        shutil.copyfile(
+            goldens_dir_path.joinpath("dbt_project.yml"),
+            self.dbt_project_tmp_dir.joinpath("dbt_project.yml"),
+        )
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.dbt_project_tmp_dir)
+
+    def _mock_run(self, args: List[str]):
+        self.subprocess_run_args.append(args)
+
+    def _mock_run_raise_error(self, args: List[str]):
+        self.subprocess_run_args.append(args)
+        raise SubprocessNonZeroExitError("sqlfluff", 65)
+
+    @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f"))
+    def test_lint_sqls_with_errors(self):
+        with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch(
+            "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run_raise_error
+        ):
+            runner = CliRunner()
+            result = runner.invoke(_cli, ["lint", "--no-fix"])
+            self.assertEqual(
+                1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)])
+            )
+            self.assertIsInstance(result.exception, SQLLintError)
+            self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args]))
+            self.assertFalse(any(["fix" in sargs for sargs in self.subprocess_run_args]))
+
+    @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f"))
+    def test_lint_sqls_without_errors(self):
+        with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch(
+            "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run
+        ):
+            runner = CliRunner()
+            result = runner.invoke(_cli, ["lint", "--no-fix"])
+            self.assertEqual(
+                0, result.exit_code, msg="\n".join([str(result.exception), str(result.output)])
+            )
+            self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args]))
+            self.assertFalse(any(["fix" in sargs for sargs in self.subprocess_run_args]))
+
+    @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f"))
+    def test_fix_sqls(self):
+        with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch(
+            "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run
+        ):
+            runner = CliRunner()
+            result = runner.invoke(_cli, ["lint"])
+            self.assertEqual(
+                0, result.exit_code, msg="\n".join([str(result.exception), str(result.output)])
+            )
+            self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args]))
+            self.assertTrue(any(["fix" in sargs for sargs in self.subprocess_run_args]))
+
+    @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f"))
+    @patch("data_pipelines_cli.cli_commands.lint.subprocess_run")
+    def test_raise_unexpected_error(self, subprocess_mock):
+        for err in [
+            ConnectionAbortedError,
+            FileNotFoundError,
+            FileExistsError,
+            KeyError,
+            FloatingPointError,
+        ]:
+            with self.subTest(exception=err), patch(
+                "pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)
+            ):
+                subprocess_mock.side_effect = err
+                runner = CliRunner()
+                result = runner.invoke(_cli, ["lint", "--no-fix"])
+                self.assertEqual(
+                    1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)])
+                )
+                self.assertIsInstance(result.exception, err)
+
+    @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f"))
+    @patch("data_pipelines_cli.cli_commands.lint.subprocess_run")
+    def test_raise_different_subprocess_error(self, subprocess_mock):
+        subprocess_mock.side_effect = SubprocessNonZeroExitError("sqlfluff", 248)
+
+        with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)):
+            runner = CliRunner()
+            result = runner.invoke(_cli, ["lint", "--no-fix"])
+            self.assertEqual(
+                1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)])
+            )
+            self.assertIsInstance(result.exception, SubprocessNonZeroExitError)
+            self.assertEqual(248, result.exception.exit_code)
+
+    @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f"))
+    @patch("data_pipelines_cli.cli_commands.lint.subprocess_run")
+    def test_raise_wrong_dialect_error(self, subprocess_mock):
+        subprocess_mock.side_effect = SubprocessNonZeroExitError("sqlfluff", 66)
+
+        with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch(
+            "data_pipelines_cli.cli_commands.lint._get_dialect_or_default", lambda: "some_dialect"
+        ):
+            runner = CliRunner()
+            runner.invoke(_cli, ["lint"])
+            self.assertEqual(2, subprocess_mock.call_count)
+
+
+class LintHelperFunctionsTestCase(unittest.TestCase):
+    def test_get_dialect(self):
+        build_dir_mock = MagicMock()
+        build_dir_mock.configure_mock(**{"joinpath": lambda _self, *_args: goldens_dir_path})
+
+        with patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", build_dir_mock):
+            self.assertEqual("bigquery", _get_dialect_or_default())
+
+    @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f"))
+    @patch("pathlib.Path.cwd", lambda: goldens_dir_path)
+    def test_get_dialect_no_build_dir(self):
+        self.assertEqual("bigquery", _get_dialect_or_default())
+
+    @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f"))
+    @patch("pathlib.Path.cwd", lambda: pathlib.Path("/a/b/c/d/e/f"))
+    def test_default_dialect(self):
+        self.assertEqual("ansi", _get_dialect_or_default())
+
+    def test_get_source_tests_paths(self):
+        with tempfile.TemporaryDirectory() as tmp_dir, patch(
+            "pathlib.Path.cwd", lambda: pathlib.Path(tmp_dir)
+        ):
+            with open(goldens_dir_path.joinpath("dbt_project.yml"), "r") as orig_dbt, open(
+                pathlib.Path(tmp_dir).joinpath("dbt_project.yml"), "w"
+            ) as tmp_dbt:
+                dbt_project = yaml.safe_load(orig_dbt)
+                dbt_project["source-paths"] = ["models", "models2", "models3"]
+                yaml.dump(dbt_project, tmp_dbt)
+
+            self.assertSetEqual(
+                {
+                    pathlib.Path(tmp_dir).joinpath(dir_name)
+                    for dir_name in ["models", "models2", "models3", "tests"]
+                },
+                set(_get_source_tests_paths()),
+            )