From 0a0ca374c9a8cc2ef5359b62c00b1205907ed563 Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Wed, 20 Aug 2025 15:04:25 +0530 Subject: [PATCH 01/10] feat: enhance pre-commit hook with validation and detailed setup instructions - Updated the pre-commit hook description for clarity. - Added validation for the configuration file in the executor hook to ensure it exists and is not empty. - Expanded README and documentation to include detailed setup instructions for the pre-commit hook, including configuration options and best practices. --- .pre-commit-hooks.yaml | 8 +- README.md | 32 ++++ docs/hooks.rst | 148 ++++++++++++++++-- .../core/platforms/dbt/hooks/executor_hook.py | 136 +++++++++++----- 4 files changed, 271 insertions(+), 53 deletions(-) diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml index af5db240..71054597 100644 --- a/.pre-commit-hooks.yaml +++ b/.pre-commit-hooks.yaml @@ -1,7 +1,13 @@ - id: datapilot_run_dbt_checks name: datapilot run dbt checks - description: datapilot run dbt checks + description: Run DataPilot dbt project health checks on changed files entry: datapilot_run_dbt_checks language: python types_or: [yaml, sql] require_serial: true + # Optional arguments that can be passed to the hook: + # --config-path: Path to configuration file + # --token: API token for authentication + # --instance-name: Tenant/instance name + # --backend-url: Backend URL (defaults to https://api.myaltimate.com) + # --config-name: Name of config to use from API diff --git a/README.md b/README.md index 8cc7882e..cfb201c2 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,38 @@ The [--config-path] is an optional argument. You can provide a yaml file with ov Note: The dbt docs generate requires an active database connection and may take a long time for projects with large number of models. +### Pre-commit Hook Integration + +DataPilot provides a pre-commit hook that automatically runs health checks on changed files before each commit. This ensures code quality and catches issues early in the development process. + +#### Quick Setup + +1. Install pre-commit: +```bash +pip install pre-commit +``` + +2. Add to your `.pre-commit-config.yaml`: +```yaml +repos: + - repo: https://github.com/AltimateAI/datapilot-cli + rev: v0.0.23 # Always use a specific version tag + hooks: + - id: datapilot_run_dbt_checks + args: [ + "--config-path", "./datapilot-config.yaml", + "--token", "${DATAPILOT_TOKEN}", + "--instance-name", "${DATAPILOT_INSTANCE}" + ] +``` + +3. Install the hook: +```bash +pre-commit install +``` + +For detailed setup instructions, see the [Pre-commit Hook Setup Guide](docs/pre-commit-setup.md). + ### Checks The following checks are available: diff --git a/docs/hooks.rst b/docs/hooks.rst index 9a86c002..fe07ca92 100644 --- a/docs/hooks.rst +++ b/docs/hooks.rst @@ -11,37 +11,153 @@ To use the DataPilot pre-commit hook, follow these steps: 1. Install the `pre-commit` package if you haven't already: -``` -pip install pre-commit -``` +.. code-block:: shell + + pip install pre-commit 2. Add the following configuration to your .pre-commit-config.yaml file in the root of your repository: -``` +.. code-block:: yaml + repos: - - repo: https://github.com/AltimateAI/datapilot-cli - rev: - hooks: - - id: datapilot_run_dbt_checks - args: ["--config-path", "path/to/your/config/file"] -``` + - repo: https://github.com/AltimateAI/datapilot-cli + rev: v0.0.23 # Use a specific version tag, not 'main' + hooks: + - id: datapilot_run_dbt_checks + args: [ + "--config-path", "./datapilot-config.yaml", + "--token", "${DATAPILOT_TOKEN}", + "--instance-name", "${DATAPILOT_INSTANCE}" + ] + +Configuration Options +--------------------- + +The DataPilot pre-commit hook supports several configuration options: + +**Required Configuration:** + +- ``rev``: Always use a specific version tag (e.g., ``v0.0.23``) instead of ``main`` for production stability + +**Optional Arguments:** + +- ``--config-path``: Path to your DataPilot configuration file +- ``--token``: Your API token for authentication (can use environment variables) +- ``--instance-name``: Your tenant/instance name (can use environment variables) +- ``--backend-url``: Backend URL (defaults to https://api.myaltimate.com) +- ``--config-name``: Name of config to use from API +- ``--base-path``: Base path of the dbt project (defaults to current directory) + +**Environment Variables:** + +You can use environment variables for sensitive information: + +.. code-block:: yaml + + repos: + - repo: https://github.com/AltimateAI/datapilot-cli + rev: v0.0.23 + hooks: + - id: datapilot_run_dbt_checks + args: [ + "--config-path", "./datapilot-config.yaml", + "--token", "${DATAPILOT_TOKEN}", + "--instance-name", "${DATAPILOT_INSTANCE}" + ] + +**Configuration File Example:** + +Create a ``datapilot-config.yaml`` file in your project root: + +.. code-block:: yaml + + # DataPilot Configuration + disabled_insights: + - "hard_coded_references" + - "duplicate_sources" -Replace with the desired revision of the DataPilot repository and "path/to/your/config/file" with the path to your configuration file. + # Custom settings for your project + project_settings: + max_fanout: 10 + require_tests: true 3. Install the pre-commit hook: -``` -pre-commit install -``` +.. code-block:: shell + + pre-commit install Usage ----- -Once the hook is installed, it will run automatically before each commit. If any issues are detected, the commit will be aborted, and you will be prompted to fix the issues before retrying the commit. +Once the hook is installed, it will run automatically before each commit. The hook will: + +1. **Validate Configuration**: Check that your config file exists and is valid +2. **Authenticate**: Use your provided token and instance name to authenticate +3. **Analyze Changes**: Only analyze files that have changed in the commit +4. **Report Issues**: Display any issues found and prevent the commit if problems are detected + +**Manual Execution:** + +To manually run all pre-commit hooks on a repository: + +.. code-block:: shell + + pre-commit run --all-files + +To run individual hooks: + +.. code-block:: shell + pre-commit run datapilot_run_dbt_checks -If you want to manually run all pre-commit hooks on a repository, run `pre-commit run --all-files`. To run individual hooks use `pre-commit run `. +**Troubleshooting:** +- **Authentication Issues**: Ensure your token and instance name are correctly set +- **Empty Config Files**: The hook will fail if your config file is empty or invalid +- **No Changes**: If no relevant files have changed, the hook will skip execution +- **Network Issues**: Ensure you have access to the DataPilot API + +Best Practices +------------- + +1. **Use Version Tags**: Always specify a version tag in the ``rev`` field, never use ``main`` +2. **Environment Variables**: Use environment variables for sensitive information like tokens +3. **Configuration Files**: Create a dedicated config file for your project settings +4. **Regular Updates**: Update to new versions when they become available +5. **Team Coordination**: Ensure all team members use the same configuration + +Example Complete Setup +--------------------- + +Here's a complete example of a ``.pre-commit-config.yaml`` file: + +.. code-block:: yaml + + # .pre-commit-config.yaml + exclude: '^(\.tox|ci/templates|\.bumpversion\.cfg)(/|$)' + + repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.14 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix, --show-fixes] + + - repo: https://github.com/psf/black + rev: 23.12.1 + hooks: + - id: black + + - repo: https://github.com/AltimateAI/datapilot-cli + rev: v0.0.23 + hooks: + - id: datapilot_run_dbt_checks + args: [ + "--config-path", "./datapilot-config.yaml", + "--token", "${DATAPILOT_TOKEN}", + "--instance-name", "${DATAPILOT_INSTANCE}" + ] Feedback and Contributions -------------------------- diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index ff5d7d78..cfffe950 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -1,5 +1,7 @@ import argparse +import sys import time +from pathlib import Path from typing import Optional from typing import Sequence @@ -13,6 +15,23 @@ from datapilot.utils.utils import generate_partial_manifest_catalog +def validate_config_file(config_path: str) -> bool: + """Validate that the config file exists and is not empty.""" + if not Path(config_path).exists(): + print(f"Error: Config file '{config_path}' does not exist.", file=sys.stderr) + return False + + try: + config = load_config(config_path) + if not config: + print(f"Error: Config file '{config_path}' is empty or invalid.", file=sys.stderr) + return False + return True + except Exception as e: + print(f"Error: Failed to load config file '{config_path}': {e}", file=sys.stderr) + return False + + def main(argv: Optional[Sequence[str]] = None): start_time = time.time() parser = argparse.ArgumentParser() @@ -28,58 +47,103 @@ def main(argv: Optional[Sequence[str]] = None): help="Base path of the dbt project", ) + parser.add_argument( + "--token", + help="Your API token for authentication.", + ) + + parser.add_argument( + "--instance-name", + help="Your tenant ID.", + ) + + parser.add_argument("--backend-url", help="Altimate's Backend URL", default="https://api.myaltimate.com") + + parser.add_argument( + "--config-name", + help="Name of the DBT config to use from the API", + ) + args = parser.parse_known_args(argv) - # print(f"args: {args}", file=sys.__stdout__) + + # Validate config file if provided config = {} if hasattr(args[0], "config_path") and args[0].config_path: - # print(f"Using config file: {args[0].config_path[0]}") - config = load_config(args[0].config_path[0]) + config_path = args[0].config_path[0] + if not validate_config_file(config_path): + print("Pre-commit hook failed: Invalid config file.", file=sys.stderr) + sys.exit(1) + config = load_config(config_path) base_path = "./" if hasattr(args[0], "base_path") and args[0].base_path: base_path = args[0].base_path[0] + # Get authentication parameters + token = getattr(args[0], "token", None) + instance_name = getattr(args[0], "instance_name", None) + backend_url = getattr(args[0], "backend_url", "https://api.myaltimate.com") + + # Validate authentication parameters + if not token: + print("Warning: No API token provided. Using default configuration.", file=sys.stderr) + print("To specify a token, use: --token 'your-token'", file=sys.stderr) + + if not instance_name: + print("Warning: No instance name provided. Using default configuration.", file=sys.stderr) + print("To specify an instance, use: --instance-name 'your-instance'", file=sys.stderr) + changed_files = args[1] - # print(f"Changed files: {changed_files}") if not changed_files: - # print("No changed files detected - test. Exiting...") + print("No changed files detected. Skipping datapilot checks.", file=sys.stderr) return - # print(f"Changed files: {changed_files}", file=sys.__stdout__) - selected_models, manifest, catalog = generate_partial_manifest_catalog(changed_files, base_path=base_path) - # print("se1ected models", selected_models, file=sys.__stdout__) - insight_generator = DBTInsightGenerator( - manifest=manifest, - catalog=catalog, - config=config, - selected_model_ids=selected_models, - ) - reports = insight_generator.run() - if reports: - model_report = generate_model_insights_table(reports[MODEL]) - if len(model_report) > 0: - print("--" * 50) - print("Model Insights") - print("--" * 50) - for model_id, report in model_report.items(): - print(f"Model: {model_id}") - print(f"File path: {report['path']}") - print(tabulate_data(report["table"], headers="keys")) - print("\n") - - project_report = generate_project_insights_table(reports[PROJECT]) - if len(project_report) > 0: - print("--" * 50) - print("Project Insights") - print("--" * 50) - print(tabulate_data(project_report, headers="keys")) - - exit(1) + try: + selected_models, manifest, catalog = generate_partial_manifest_catalog(changed_files, base_path=base_path) + + insight_generator = DBTInsightGenerator( + manifest=manifest, + catalog=catalog, + config=config, + selected_model_ids=selected_models, + token=token, + instance_name=instance_name, + backend_url=backend_url, + ) + + reports = insight_generator.run() + + if reports: + model_report = generate_model_insights_table(reports[MODEL]) + if len(model_report) > 0: + print("--" * 50, file=sys.stderr) + print("Model Insights", file=sys.stderr) + print("--" * 50, file=sys.stderr) + for model_id, report in model_report.items(): + print(f"Model: {model_id}", file=sys.stderr) + print(f"File path: {report['path']}", file=sys.stderr) + print(tabulate_data(report["table"], headers="keys"), file=sys.stderr) + print("\n", file=sys.stderr) + + project_report = generate_project_insights_table(reports[PROJECT]) + if len(project_report) > 0: + print("--" * 50, file=sys.stderr) + print("Project Insights", file=sys.stderr) + print("--" * 50, file=sys.stderr) + print(tabulate_data(project_report, headers="keys"), file=sys.stderr) + + print("\nPre-commit hook failed: DataPilot found issues that need to be addressed.", file=sys.stderr) + sys.exit(1) + + except Exception as e: + print(f"Error running DataPilot checks: {e}", file=sys.stderr) + print("Pre-commit hook failed due to an error.", file=sys.stderr) + sys.exit(1) end_time = time.time() total_time = end_time - start_time - print(f"Total time taken: {round(total_time, 2)} seconds") + print(f"DataPilot checks completed successfully in {round(total_time, 2)} seconds", file=sys.stderr) if __name__ == "__main__": From 07b97fdb50ee7dc2ca4b07285292d2509bf5c67c Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Wed, 20 Aug 2025 15:32:59 +0530 Subject: [PATCH 02/10] feat: enhance pre-commit hook with improved config handling and logging - Added functionality to load configuration from both file and API, with detailed error handling. - Introduced logging statements to provide feedback during the execution of the pre-commit hook. - Improved handling of changed files and insights generation process, ensuring better visibility into operations. --- .../core/platforms/dbt/hooks/executor_hook.py | 63 ++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index cfffe950..a51e260c 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -5,6 +5,7 @@ from typing import Optional from typing import Sequence +from datapilot.clients.altimate.utils import get_all_dbt_configs from datapilot.config.config import load_config from datapilot.core.platforms.dbt.constants import MODEL from datapilot.core.platforms.dbt.constants import PROJECT @@ -34,6 +35,8 @@ def validate_config_file(config_path: str) -> bool: def main(argv: Optional[Sequence[str]] = None): start_time = time.time() + print("Starting DataPilot pre-commit hook...", file=sys.stderr) + parser = argparse.ArgumentParser() parser.add_argument( "--config-path", @@ -66,18 +69,60 @@ def main(argv: Optional[Sequence[str]] = None): args = parser.parse_known_args(argv) - # Validate config file if provided - config = {} + # Handle config loading like in project_health command + config = None if hasattr(args[0], "config_path") and args[0].config_path: config_path = args[0].config_path[0] + print(f"Loading config from file: {config_path}", file=sys.stderr) if not validate_config_file(config_path): print("Pre-commit hook failed: Invalid config file.", file=sys.stderr) sys.exit(1) config = load_config(config_path) + print("Config loaded successfully from file", file=sys.stderr) + elif ( + hasattr(args[0], "config_name") + and args[0].config_name + and hasattr(args[0], "token") + and args[0].token + and hasattr(args[0], "instance_name") + and args[0].instance_name + ): + config_name = args[0].config_name + token = args[0].token + instance_name = args[0].instance_name + backend_url = getattr(args[0], "backend_url", "https://api.myaltimate.com") + + print(f"Fetching config '{config_name}' from API...", file=sys.stderr) + try: + # Get configs from API + configs = get_all_dbt_configs(token, instance_name, backend_url) + if configs and "items" in configs: + # Find config by name + matching_configs = [c for c in configs["items"] if c["name"] == config_name] + if matching_configs: + # Get the config directly from the API response + print(f"Using config from API: {config_name}", file=sys.stderr) + config = matching_configs[0].get("config", {}) + else: + print(f"No config found with name: {config_name}", file=sys.stderr) + print("Pre-commit hook failed: Config not found.", file=sys.stderr) + sys.exit(1) + else: + print("Failed to fetch configs from API", file=sys.stderr) + print("Pre-commit hook failed: Unable to fetch configs.", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error fetching config from API: {e}", file=sys.stderr) + print("Pre-commit hook failed: API error.", file=sys.stderr) + sys.exit(1) + else: + print("No config provided. Using default configuration.", file=sys.stderr) + config = {} base_path = "./" if hasattr(args[0], "base_path") and args[0].base_path: base_path = args[0].base_path[0] + print(f"Using base path: {base_path}", file=sys.stderr) # Get authentication parameters token = getattr(args[0], "token", None) @@ -99,9 +144,19 @@ def main(argv: Optional[Sequence[str]] = None): print("No changed files detected. Skipping datapilot checks.", file=sys.stderr) return + print(f"Processing {len(changed_files)} changed files...", file=sys.stderr) + print(f"Changed files: {', '.join(changed_files)}", file=sys.stderr) + try: + print("Generating partial manifest and catalog from changed files...", file=sys.stderr) selected_models, manifest, catalog = generate_partial_manifest_catalog(changed_files, base_path=base_path) + print(f"Generated manifest with {len(manifest.get('nodes', {}))} nodes", file=sys.stderr) + if catalog: + print(f"Generated catalog with {len(catalog.get('nodes', {}))} nodes", file=sys.stderr) + else: + print("No catalog generated (catalog file not available)", file=sys.stderr) + print("Initializing DBT Insight Generator...", file=sys.stderr) insight_generator = DBTInsightGenerator( manifest=manifest, catalog=catalog, @@ -112,9 +167,11 @@ def main(argv: Optional[Sequence[str]] = None): backend_url=backend_url, ) + print("Running insight generation...", file=sys.stderr) reports = insight_generator.run() if reports: + print("Insights generated successfully. Analyzing results...", file=sys.stderr) model_report = generate_model_insights_table(reports[MODEL]) if len(model_report) > 0: print("--" * 50, file=sys.stderr) @@ -135,6 +192,8 @@ def main(argv: Optional[Sequence[str]] = None): print("\nPre-commit hook failed: DataPilot found issues that need to be addressed.", file=sys.stderr) sys.exit(1) + else: + print("No insights generated. All checks passed!", file=sys.stderr) except Exception as e: print(f"Error running DataPilot checks: {e}", file=sys.stderr) From 0e533cf5f0d66c24c9ac6ee9cfdd2fc14c29bcf8 Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Wed, 20 Aug 2025 15:36:51 +0530 Subject: [PATCH 03/10] Fix manifest object handling in executor hook logging --- .../core/platforms/dbt/hooks/executor_hook.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index a51e260c..0b222624 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -150,9 +150,23 @@ def main(argv: Optional[Sequence[str]] = None): try: print("Generating partial manifest and catalog from changed files...", file=sys.stderr) selected_models, manifest, catalog = generate_partial_manifest_catalog(changed_files, base_path=base_path) - print(f"Generated manifest with {len(manifest.get('nodes', {}))} nodes", file=sys.stderr) + + # Handle manifest object (could be ManifestV12 or similar) + if hasattr(manifest, "nodes"): + print(f"Generated manifest with {len(manifest.nodes)} nodes", file=sys.stderr) + elif hasattr(manifest, "get") and callable(manifest.get): + print(f"Generated manifest with {len(manifest.get('nodes', {}))} nodes", file=sys.stderr) + else: + print(f"Generated manifest object of type: {type(manifest).__name__}", file=sys.stderr) + + # Handle catalog object (could be CatalogV1 or similar) if catalog: - print(f"Generated catalog with {len(catalog.get('nodes', {}))} nodes", file=sys.stderr) + if hasattr(catalog, "nodes"): + print(f"Generated catalog with {len(catalog.nodes)} nodes", file=sys.stderr) + elif hasattr(catalog, "get") and callable(catalog.get): + print(f"Generated catalog with {len(catalog.get('nodes', {}))} nodes", file=sys.stderr) + else: + print(f"Generated catalog object of type: {type(catalog).__name__}", file=sys.stderr) else: print("No catalog generated (catalog file not available)", file=sys.stderr) From 158a7c70f8fd5474b4a3fea123020923ea60676e Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Wed, 20 Aug 2025 15:39:25 +0530 Subject: [PATCH 04/10] Enhance logging in executor hook to include config ID when using API --- src/datapilot/core/platforms/dbt/hooks/executor_hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index 0b222624..3631d103 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -101,7 +101,7 @@ def main(argv: Optional[Sequence[str]] = None): matching_configs = [c for c in configs["items"] if c["name"] == config_name] if matching_configs: # Get the config directly from the API response - print(f"Using config from API: {config_name}", file=sys.stderr) + print(f"Using config from API: {config_name} id: {matching_configs[0]['id']}", file=sys.stderr) config = matching_configs[0].get("config", {}) else: print(f"No config found with name: {config_name}", file=sys.stderr) From 8b6829b768fb9f708ac09e76f7a746cbb675943f Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Wed, 20 Aug 2025 16:08:11 +0530 Subject: [PATCH 05/10] Update logging in executor hook to clarify config ID output --- src/datapilot/core/platforms/dbt/hooks/executor_hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index 3631d103..1804bbeb 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -101,7 +101,7 @@ def main(argv: Optional[Sequence[str]] = None): matching_configs = [c for c in configs["items"] if c["name"] == config_name] if matching_configs: # Get the config directly from the API response - print(f"Using config from API: {config_name} id: {matching_configs[0]['id']}", file=sys.stderr) + print(f"Using config from API: {config_name} Config ID: {matching_configs[0]['id']}", file=sys.stderr) config = matching_configs[0].get("config", {}) else: print(f"No config found with name: {config_name}", file=sys.stderr) From aec9ae3f97426c9032f940b01f0646c09eacf918 Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Wed, 20 Aug 2025 16:13:01 +0530 Subject: [PATCH 06/10] Fix executor hook to only fail when actual issues are found --- .../core/platforms/dbt/hooks/executor_hook.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index 1804bbeb..5b5e08d0 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -187,25 +187,34 @@ def main(argv: Optional[Sequence[str]] = None): if reports: print("Insights generated successfully. Analyzing results...", file=sys.stderr) model_report = generate_model_insights_table(reports[MODEL]) - if len(model_report) > 0: + project_report = generate_project_insights_table(reports[PROJECT]) + + # Check if there are actual issues found + has_model_issues = len(model_report) > 0 + has_project_issues = len(project_report) > 0 + + if has_model_issues: print("--" * 50, file=sys.stderr) print("Model Insights", file=sys.stderr) print("--" * 50, file=sys.stderr) - for model_id, report in model_report.items(): - print(f"Model: {model_id}", file=sys.stderr) - print(f"File path: {report['path']}", file=sys.stderr) - print(tabulate_data(report["table"], headers="keys"), file=sys.stderr) - print("\n", file=sys.stderr) + for model_id, report in model_report.items(): + print(f"Model: {model_id}", file=sys.stderr) + print(f"File path: {report['path']}", file=sys.stderr) + print(tabulate_data(report["table"], headers="keys"), file=sys.stderr) + print("\n", file=sys.stderr) - project_report = generate_project_insights_table(reports[PROJECT]) - if len(project_report) > 0: + if has_project_issues: print("--" * 50, file=sys.stderr) print("Project Insights", file=sys.stderr) print("--" * 50, file=sys.stderr) print(tabulate_data(project_report, headers="keys"), file=sys.stderr) - print("\nPre-commit hook failed: DataPilot found issues that need to be addressed.", file=sys.stderr) - sys.exit(1) + # Only fail if there are actual issues + if has_model_issues or has_project_issues: + print("\nPre-commit hook failed: DataPilot found issues that need to be addressed.", file=sys.stderr) + sys.exit(1) + else: + print("All checks passed! No issues found.", file=sys.stderr) else: print("No insights generated. All checks passed!", file=sys.stderr) From 2b985afa26702bc7faa6921b0b4e3fe5614152b5 Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Wed, 20 Aug 2025 21:55:32 +0530 Subject: [PATCH 07/10] Fix pre-commit hook: load manifest/catalog from files instead of generating partial ones --- .../core/platforms/dbt/hooks/executor_hook.py | 88 ++++++++++++++----- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index 5b5e08d0..76ce9339 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -12,8 +12,9 @@ from datapilot.core.platforms.dbt.executor import DBTInsightGenerator from datapilot.core.platforms.dbt.formatting import generate_model_insights_table from datapilot.core.platforms.dbt.formatting import generate_project_insights_table +from datapilot.core.platforms.dbt.utils import load_catalog +from datapilot.core.platforms.dbt.utils import load_manifest from datapilot.utils.formatting.utils import tabulate_data -from datapilot.utils.utils import generate_partial_manifest_catalog def validate_config_file(config_path: str) -> bool: @@ -67,6 +68,16 @@ def main(argv: Optional[Sequence[str]] = None): help="Name of the DBT config to use from the API", ) + parser.add_argument( + "--manifest-path", + help="Path to the DBT manifest file (defaults to ./target/manifest.json)", + ) + + parser.add_argument( + "--catalog-path", + help="Path to the DBT catalog file (defaults to ./target/catalog.json)", + ) + args = parser.parse_known_args(argv) # Handle config loading like in project_health command @@ -138,44 +149,77 @@ def main(argv: Optional[Sequence[str]] = None): print("Warning: No instance name provided. Using default configuration.", file=sys.stderr) print("To specify an instance, use: --instance-name 'your-instance'", file=sys.stderr) - changed_files = args[1] + # Determine manifest and catalog paths + manifest_path = getattr(args[0], "manifest_path", None) + catalog_path = getattr(args[0], "catalog_path", None) - if not changed_files: - print("No changed files detected. Skipping datapilot checks.", file=sys.stderr) - return + # Set default paths if not provided + if not manifest_path: + manifest_path = str(Path(base_path) / "target" / "manifest.json") + print(f"Using default manifest path: {manifest_path}", file=sys.stderr) + else: + print(f"Using provided manifest path: {manifest_path}", file=sys.stderr) - print(f"Processing {len(changed_files)} changed files...", file=sys.stderr) - print(f"Changed files: {', '.join(changed_files)}", file=sys.stderr) + if not catalog_path: + catalog_path = str(Path(base_path) / "target" / "catalog.json") + print(f"Using default catalog path: {catalog_path}", file=sys.stderr) + else: + print(f"Using provided catalog path: {catalog_path}", file=sys.stderr) + # Load manifest + print("Loading manifest file...", file=sys.stderr) try: - print("Generating partial manifest and catalog from changed files...", file=sys.stderr) - selected_models, manifest, catalog = generate_partial_manifest_catalog(changed_files, base_path=base_path) - - # Handle manifest object (could be ManifestV12 or similar) + manifest = load_manifest(manifest_path) if hasattr(manifest, "nodes"): - print(f"Generated manifest with {len(manifest.nodes)} nodes", file=sys.stderr) + print(f"Manifest loaded successfully with {len(manifest.nodes)} nodes", file=sys.stderr) elif hasattr(manifest, "get") and callable(manifest.get): - print(f"Generated manifest with {len(manifest.get('nodes', {}))} nodes", file=sys.stderr) + print(f"Manifest loaded successfully with {len(manifest.get('nodes', {}))} nodes", file=sys.stderr) else: - print(f"Generated manifest object of type: {type(manifest).__name__}", file=sys.stderr) + print(f"Manifest loaded successfully, object type: {type(manifest).__name__}", file=sys.stderr) + except Exception as e: + print(f"Error loading manifest from {manifest_path}: {e}", file=sys.stderr) + print("Pre-commit hook failed: Unable to load manifest file.", file=sys.stderr) + sys.exit(1) - # Handle catalog object (could be CatalogV1 or similar) - if catalog: + # Load catalog if available + catalog = None + if Path(catalog_path).exists(): + print("Loading catalog file...", file=sys.stderr) + try: + catalog = load_catalog(catalog_path) if hasattr(catalog, "nodes"): - print(f"Generated catalog with {len(catalog.nodes)} nodes", file=sys.stderr) + print(f"Catalog loaded successfully with {len(catalog.nodes)} nodes", file=sys.stderr) elif hasattr(catalog, "get") and callable(catalog.get): - print(f"Generated catalog with {len(catalog.get('nodes', {}))} nodes", file=sys.stderr) + print(f"Catalog loaded successfully with {len(catalog.get('nodes', {}))} nodes", file=sys.stderr) else: - print(f"Generated catalog object of type: {type(catalog).__name__}", file=sys.stderr) - else: - print("No catalog generated (catalog file not available)", file=sys.stderr) + print(f"Catalog loaded successfully, object type: {type(catalog).__name__}", file=sys.stderr) + except Exception as e: + print(f"Warning: Error loading catalog from {catalog_path}: {e}", file=sys.stderr) + print("Continuing without catalog...", file=sys.stderr) + catalog = None + else: + print(f"Catalog file not found at {catalog_path}, continuing without catalog", file=sys.stderr) + # Get changed files for selective model testing + changed_files = args[1] + selected_models = [] + + if changed_files: + print(f"Processing {len(changed_files)} changed files for selective testing...", file=sys.stderr) + print(f"Changed files: {', '.join(changed_files)}", file=sys.stderr) + # Extract model names from changed files for selective testing + # This could be enhanced to map file paths to model names + selected_models = changed_files + else: + print("No changed files detected. Running checks on all models.", file=sys.stderr) + + try: print("Initializing DBT Insight Generator...", file=sys.stderr) insight_generator = DBTInsightGenerator( manifest=manifest, catalog=catalog, config=config, - selected_model_ids=selected_models, + selected_models=selected_models, token=token, instance_name=instance_name, backend_url=backend_url, From c31fb1d7cf59f1d8f1c780de3bfd12e4fdea29a7 Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Thu, 21 Aug 2025 11:45:23 +0530 Subject: [PATCH 08/10] Update pre-commit hook configuration to version v0.0.27 and enhance documentation - Updated the pre-commit hook version to v0.0.27 in configuration files. - Added new optional arguments for manifest and catalog file paths in the README and documentation. - Clarified requirements for DBT artifacts in the documentation to improve user guidance. --- .pre-commit-hooks.yaml | 3 +++ README.md | 6 ++++-- docs/hooks.rst | 34 ++++++++++++++++++++++++++-------- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml index 71054597..32d7303c 100644 --- a/.pre-commit-hooks.yaml +++ b/.pre-commit-hooks.yaml @@ -11,3 +11,6 @@ # --instance-name: Tenant/instance name # --backend-url: Backend URL (defaults to https://api.myaltimate.com) # --config-name: Name of config to use from API + # --manifest-path: Path to DBT manifest file (defaults to ./target/manifest.json) + # --catalog-path: Path to DBT catalog file (defaults to ./target/catalog.json) + # --base-path: Base path of the dbt project (defaults to current directory) diff --git a/README.md b/README.md index cfb201c2..dc8b4b53 100644 --- a/README.md +++ b/README.md @@ -57,13 +57,15 @@ pip install pre-commit ```yaml repos: - repo: https://github.com/AltimateAI/datapilot-cli - rev: v0.0.23 # Always use a specific version tag + rev: v0.0.27 # Always use a specific version tag hooks: - id: datapilot_run_dbt_checks args: [ "--config-path", "./datapilot-config.yaml", "--token", "${DATAPILOT_TOKEN}", - "--instance-name", "${DATAPILOT_INSTANCE}" + "--instance-name", "${DATAPILOT_INSTANCE}", + "--manifest-path", "./target/manifest.json", + "--catalog-path", "./target/catalog.json" ] ``` diff --git a/docs/hooks.rst b/docs/hooks.rst index fe07ca92..0e1d03fa 100644 --- a/docs/hooks.rst +++ b/docs/hooks.rst @@ -21,7 +21,7 @@ To use the DataPilot pre-commit hook, follow these steps: repos: - repo: https://github.com/AltimateAI/datapilot-cli - rev: v0.0.23 # Use a specific version tag, not 'main' + rev: v0.0.27 # Use a specific version tag, not 'main' hooks: - id: datapilot_run_dbt_checks args: [ @@ -37,7 +37,7 @@ The DataPilot pre-commit hook supports several configuration options: **Required Configuration:** -- ``rev``: Always use a specific version tag (e.g., ``v0.0.23``) instead of ``main`` for production stability +- ``rev``: Always use a specific version tag (e.g., ``v0.0.27``) instead of ``main`` for production stability **Optional Arguments:** @@ -47,6 +47,8 @@ The DataPilot pre-commit hook supports several configuration options: - ``--backend-url``: Backend URL (defaults to https://api.myaltimate.com) - ``--config-name``: Name of config to use from API - ``--base-path``: Base path of the dbt project (defaults to current directory) +- ``--manifest-path``: Path to the DBT manifest file (defaults to {base_path}/target/manifest.json) +- ``--catalog-path``: Path to the DBT catalog file (defaults to {base_path}/target/catalog.json) **Environment Variables:** @@ -56,13 +58,15 @@ You can use environment variables for sensitive information: repos: - repo: https://github.com/AltimateAI/datapilot-cli - rev: v0.0.23 + rev: v0.0.27 hooks: - id: datapilot_run_dbt_checks args: [ "--config-path", "./datapilot-config.yaml", "--token", "${DATAPILOT_TOKEN}", - "--instance-name", "${DATAPILOT_INSTANCE}" + "--instance-name", "${DATAPILOT_INSTANCE}", + "--manifest-path", "./target/manifest.json", + "--catalog-path", "./target/catalog.json" ] **Configuration File Example:** @@ -94,8 +98,18 @@ Once the hook is installed, it will run automatically before each commit. The ho 1. **Validate Configuration**: Check that your config file exists and is valid 2. **Authenticate**: Use your provided token and instance name to authenticate -3. **Analyze Changes**: Only analyze files that have changed in the commit -4. **Report Issues**: Display any issues found and prevent the commit if problems are detected +3. **Load DBT Artifacts**: Load manifest and catalog files for analysis +4. **Analyze Changes**: Only analyze files that have changed in the commit +5. **Report Issues**: Display any issues found and prevent the commit if problems are detected + +**Required DBT Artifacts:** + +The pre-commit hook requires DBT manifest and catalog files to function properly: + +- **Manifest File**: Generated by running `dbt compile` or `dbt run`. Default location: `./target/manifest.json` +- **Catalog File**: Generated by running `dbt docs generate`. Default location: `./target/catalog.json` + +**Note**: The catalog file is optional but recommended for comprehensive analysis. If not available, the hook will continue without catalog information. **Manual Execution:** @@ -115,6 +129,8 @@ To run individual hooks: - **Authentication Issues**: Ensure your token and instance name are correctly set - **Empty Config Files**: The hook will fail if your config file is empty or invalid +- **Missing Manifest File**: Ensure you have run `dbt compile` or `dbt run` to generate the manifest.json file +- **Missing Catalog File**: Run `dbt docs generate` to create the catalog.json file (optional but recommended) - **No Changes**: If no relevant files have changed, the hook will skip execution - **Network Issues**: Ensure you have access to the DataPilot API @@ -150,13 +166,15 @@ Here's a complete example of a ``.pre-commit-config.yaml`` file: - id: black - repo: https://github.com/AltimateAI/datapilot-cli - rev: v0.0.23 + rev: v0.0.27 hooks: - id: datapilot_run_dbt_checks args: [ "--config-path", "./datapilot-config.yaml", "--token", "${DATAPILOT_TOKEN}", - "--instance-name", "${DATAPILOT_INSTANCE}" + "--instance-name", "${DATAPILOT_INSTANCE}", + "--manifest-path", "./target/manifest.json", + "--catalog-path", "./target/catalog.json" ] Feedback and Contributions From 3c891cbd90c943f01ce7affa1ad21b6846884278 Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Thu, 21 Aug 2025 12:21:29 +0530 Subject: [PATCH 09/10] Refactor executor hook for improved configuration handling and logging - Introduced functions to load configuration from both file and API, enhancing error handling and user feedback. - Added new command-line arguments for better flexibility in specifying paths and configurations. - Streamlined the process of loading manifest and catalog files, with improved logging for insights generation. - Enhanced the handling of changed files for selective model testing, ensuring clarity in output and error messages. --- .../core/platforms/dbt/hooks/executor_hook.py | 401 ++++++++++-------- 1 file changed, 219 insertions(+), 182 deletions(-) diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index 76ce9339..e1e687d4 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Optional from typing import Sequence +from typing import Tuple from datapilot.clients.altimate.utils import get_all_dbt_configs from datapilot.config.config import load_config @@ -17,6 +18,17 @@ from datapilot.utils.formatting.utils import tabulate_data +def get_config(name: str, matching_configs: list) -> dict: + """Extract config from matching configs by name.""" + if matching_configs: + print(f"Using config from API: {name} Config ID: {matching_configs[0]['id']}", file=sys.stderr) + return matching_configs[0].get("config", {}) + else: + print(f"No config found with name: {name}", file=sys.stderr) + print("Pre-commit hook failed: Config not found.", file=sys.stderr) + sys.exit(1) + + def validate_config_file(config_path: str) -> bool: """Validate that the config file exists and is not empty.""" if not Path(config_path).exists(): @@ -34,126 +46,80 @@ def validate_config_file(config_path: str) -> bool: return False -def main(argv: Optional[Sequence[str]] = None): - start_time = time.time() - print("Starting DataPilot pre-commit hook...", file=sys.stderr) - +def setup_argument_parser() -> argparse.ArgumentParser: + """Set up and return the argument parser.""" parser = argparse.ArgumentParser() - parser.add_argument( - "--config-path", - nargs="*", - help="Path of the config file to be used for the insight generation", - ) + parser.add_argument("--config-path", nargs="*", help="Path of the config file to be used for the insight generation") + parser.add_argument("--base-path", nargs="*", help="Base path of the dbt project") + parser.add_argument("--token", help="Your API token for authentication.") + parser.add_argument("--instance-name", help="Your tenant ID.") + parser.add_argument("--backend-url", help="Altimate's Backend URL", default="https://api.myaltimate.com") + parser.add_argument("--config-name", help="Name of the DBT config to use from the API") + parser.add_argument("--manifest-path", help="Path to the DBT manifest file (defaults to ./target/manifest.json)") + parser.add_argument("--catalog-path", help="Path to the DBT catalog file (defaults to ./target/catalog.json)") + return parser - parser.add_argument( - "--base-path", - nargs="*", - help="Base path of the dbt project", - ) - parser.add_argument( - "--token", - help="Your API token for authentication.", - ) +def extract_arguments(args) -> Tuple[str, str, str, str, str, str, str]: + """Extract and return common arguments from parsed args.""" + config_name = getattr(args, "config_name", None) + token = getattr(args, "token", None) + instance_name = getattr(args, "instance_name", None) + backend_url = getattr(args, "backend_url", "https://api.myaltimate.com") - parser.add_argument( - "--instance-name", - help="Your tenant ID.", - ) + # Extract config_path and base_path (they are lists) + config_path = args.config_path[0] if args.config_path else None + base_path = args.base_path[0] if args.base_path else "./" - parser.add_argument("--backend-url", help="Altimate's Backend URL", default="https://api.myaltimate.com") + # Extract manifest and catalog paths + manifest_path = getattr(args, "manifest_path", None) + catalog_path = getattr(args, "catalog_path", None) - parser.add_argument( - "--config-name", - help="Name of the DBT config to use from the API", - ) + return config_name, token, instance_name, backend_url, config_path, base_path, manifest_path, catalog_path - parser.add_argument( - "--manifest-path", - help="Path to the DBT manifest file (defaults to ./target/manifest.json)", - ) - parser.add_argument( - "--catalog-path", - help="Path to the DBT catalog file (defaults to ./target/catalog.json)", - ) +def load_config_from_file(config_path: str) -> dict: + """Load configuration from a file.""" + print(f"Loading config from file: {config_path}", file=sys.stderr) + if not validate_config_file(config_path): + print("Pre-commit hook failed: Invalid config file.", file=sys.stderr) + sys.exit(1) + config = load_config(config_path) + print("Config loaded successfully from file", file=sys.stderr) + return config - args = parser.parse_known_args(argv) - # Handle config loading like in project_health command - config = None - if hasattr(args[0], "config_path") and args[0].config_path: - config_path = args[0].config_path[0] - print(f"Loading config from file: {config_path}", file=sys.stderr) - if not validate_config_file(config_path): - print("Pre-commit hook failed: Invalid config file.", file=sys.stderr) - sys.exit(1) - config = load_config(config_path) - print("Config loaded successfully from file", file=sys.stderr) - elif ( - hasattr(args[0], "config_name") - and args[0].config_name - and hasattr(args[0], "token") - and args[0].token - and hasattr(args[0], "instance_name") - and args[0].instance_name - ): - config_name = args[0].config_name - token = args[0].token - instance_name = args[0].instance_name - backend_url = getattr(args[0], "backend_url", "https://api.myaltimate.com") - - print(f"Fetching config '{config_name}' from API...", file=sys.stderr) - try: - # Get configs from API - configs = get_all_dbt_configs(token, instance_name, backend_url) - if configs and "items" in configs: - # Find config by name - matching_configs = [c for c in configs["items"] if c["name"] == config_name] - if matching_configs: - # Get the config directly from the API response - print(f"Using config from API: {config_name} Config ID: {matching_configs[0]['id']}", file=sys.stderr) - config = matching_configs[0].get("config", {}) - else: - print(f"No config found with name: {config_name}", file=sys.stderr) - print("Pre-commit hook failed: Config not found.", file=sys.stderr) - sys.exit(1) - else: - print("Failed to fetch configs from API", file=sys.stderr) - print("Pre-commit hook failed: Unable to fetch configs.", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"Error fetching config from API: {e}", file=sys.stderr) - print("Pre-commit hook failed: API error.", file=sys.stderr) +def load_config_from_api(config_name: str, token: str, instance_name: str, backend_url: str) -> dict: + """Load configuration from API.""" + print(f"Fetching config '{config_name}' from API...", file=sys.stderr) + try: + configs = get_all_dbt_configs(token, instance_name, backend_url) + if configs and "items" in configs: + matching_configs = [c for c in configs["items"] if c["name"] == config_name] + return get_config(config_name, matching_configs) + else: + print("Failed to fetch configs from API", file=sys.stderr) + print("Pre-commit hook failed: Unable to fetch configs.", file=sys.stderr) sys.exit(1) - else: - print("No config provided. Using default configuration.", file=sys.stderr) - config = {} - - base_path = "./" - if hasattr(args[0], "base_path") and args[0].base_path: - base_path = args[0].base_path[0] - print(f"Using base path: {base_path}", file=sys.stderr) + except Exception as e: + print(f"Error fetching config from API: {e}", file=sys.stderr) + print("Pre-commit hook failed: API error.", file=sys.stderr) + sys.exit(1) - # Get authentication parameters - token = getattr(args[0], "token", None) - instance_name = getattr(args[0], "instance_name", None) - backend_url = getattr(args[0], "backend_url", "https://api.myaltimate.com") - # Validate authentication parameters - if not token: - print("Warning: No API token provided. Using default configuration.", file=sys.stderr) - print("To specify a token, use: --token 'your-token'", file=sys.stderr) - - if not instance_name: - print("Warning: No instance name provided. Using default configuration.", file=sys.stderr) - print("To specify an instance, use: --instance-name 'your-instance'", file=sys.stderr) +def get_configuration(config_name: str, token: str, instance_name: str, backend_url: str, config_path: str) -> dict: + """Get configuration from file, API, or use defaults.""" + if config_path: + return load_config_from_file(config_path) + elif config_name and token and instance_name: + return load_config_from_api(config_name, token, instance_name, backend_url) + else: + print("No config provided. Using default configuration.", file=sys.stderr) + return {} - # Determine manifest and catalog paths - manifest_path = getattr(args[0], "manifest_path", None) - catalog_path = getattr(args[0], "catalog_path", None) - # Set default paths if not provided +def get_file_paths(base_path: str, manifest_path: str, catalog_path: str) -> Tuple[str, str]: + """Determine manifest and catalog file paths.""" if not manifest_path: manifest_path = str(Path(base_path) / "target" / "manifest.json") print(f"Using default manifest path: {manifest_path}", file=sys.stderr) @@ -166,101 +132,172 @@ def main(argv: Optional[Sequence[str]] = None): else: print(f"Using provided catalog path: {catalog_path}", file=sys.stderr) - # Load manifest + return manifest_path, catalog_path + + +def load_manifest_file(manifest_path: str): + """Load and validate manifest file.""" print("Loading manifest file...", file=sys.stderr) try: manifest = load_manifest(manifest_path) - if hasattr(manifest, "nodes"): - print(f"Manifest loaded successfully with {len(manifest.nodes)} nodes", file=sys.stderr) - elif hasattr(manifest, "get") and callable(manifest.get): - print(f"Manifest loaded successfully with {len(manifest.get('nodes', {}))} nodes", file=sys.stderr) - else: - print(f"Manifest loaded successfully, object type: {type(manifest).__name__}", file=sys.stderr) + node_count = get_node_count(manifest) + print(f"Manifest loaded successfully with {node_count} nodes", file=sys.stderr) + return manifest except Exception as e: print(f"Error loading manifest from {manifest_path}: {e}", file=sys.stderr) print("Pre-commit hook failed: Unable to load manifest file.", file=sys.stderr) sys.exit(1) - # Load catalog if available - catalog = None - if Path(catalog_path).exists(): - print("Loading catalog file...", file=sys.stderr) - try: - catalog = load_catalog(catalog_path) - if hasattr(catalog, "nodes"): - print(f"Catalog loaded successfully with {len(catalog.nodes)} nodes", file=sys.stderr) - elif hasattr(catalog, "get") and callable(catalog.get): - print(f"Catalog loaded successfully with {len(catalog.get('nodes', {}))} nodes", file=sys.stderr) - else: - print(f"Catalog loaded successfully, object type: {type(catalog).__name__}", file=sys.stderr) - except Exception as e: - print(f"Warning: Error loading catalog from {catalog_path}: {e}", file=sys.stderr) - print("Continuing without catalog...", file=sys.stderr) - catalog = None - else: + +def load_catalog_file(catalog_path: str): + """Load catalog file if it exists.""" + if not Path(catalog_path).exists(): print(f"Catalog file not found at {catalog_path}, continuing without catalog", file=sys.stderr) + return None + + print("Loading catalog file...", file=sys.stderr) + try: + catalog = load_catalog(catalog_path) + node_count = get_node_count(catalog) + print(f"Catalog loaded successfully with {node_count} nodes", file=sys.stderr) + return catalog + except Exception as e: + print(f"Warning: Error loading catalog from {catalog_path}: {e}", file=sys.stderr) + print("Continuing without catalog...", file=sys.stderr) + return None - # Get changed files for selective model testing - changed_files = args[1] - selected_models = [] +def get_node_count(obj) -> int: + """Get the number of nodes from manifest or catalog object.""" + if hasattr(obj, "nodes"): + return len(obj.nodes) + elif hasattr(obj, "get") and callable(obj.get): + return len(obj.get("nodes", {})) + else: + return 0 + + +def process_changed_files(changed_files: list) -> list: + """Process changed files for selective model testing.""" if changed_files: print(f"Processing {len(changed_files)} changed files for selective testing...", file=sys.stderr) print(f"Changed files: {', '.join(changed_files)}", file=sys.stderr) - # Extract model names from changed files for selective testing - # This could be enhanced to map file paths to model names - selected_models = changed_files + return changed_files else: print("No changed files detected. Running checks on all models.", file=sys.stderr) + return [] + + +def run_insight_generation(manifest, catalog, config, selected_models, token, instance_name, backend_url): + """Run the insight generation process.""" + print("Initializing DBT Insight Generator...", file=sys.stderr) + insight_generator = DBTInsightGenerator( + manifest=manifest, + catalog=catalog, + config=config, + selected_models=selected_models, + token=token, + instance_name=instance_name, + backend_url=backend_url, + ) + + print("Running insight generation...", file=sys.stderr) + return insight_generator.run() + + +def print_model_insights(model_report: dict): + """Print model insights in a formatted table.""" + print("--" * 50, file=sys.stderr) + print("Model Insights", file=sys.stderr) + print("--" * 50, file=sys.stderr) + for model_id, report in model_report.items(): + print(f"Model: {model_id}", file=sys.stderr) + print(f"File path: {report['path']}", file=sys.stderr) + print(tabulate_data(report["table"], headers="keys"), file=sys.stderr) + print("\n", file=sys.stderr) + + +def print_project_insights(project_report: dict): + """Print project insights in a formatted table.""" + print("--" * 50, file=sys.stderr) + print("Project Insights", file=sys.stderr) + print("--" * 50, file=sys.stderr) + print(tabulate_data(project_report, headers="keys"), file=sys.stderr) + + +def process_reports(reports: dict) -> bool: + """Process and display reports, return True if issues found.""" + if not reports: + print("No insights generated. All checks passed!", file=sys.stderr) + return False + + print("Insights generated successfully. Analyzing results...", file=sys.stderr) + model_report = generate_model_insights_table(reports[MODEL]) + project_report = generate_project_insights_table(reports[PROJECT]) + + has_model_issues = len(model_report) > 0 + has_project_issues = len(project_report) > 0 + + if has_model_issues: + print_model_insights(model_report) + + if has_project_issues: + print_project_insights(project_report) + + if has_model_issues or has_project_issues: + print("\nPre-commit hook failed: DataPilot found issues that need to be addressed.", file=sys.stderr) + return True + else: + print("All checks passed! No issues found.", file=sys.stderr) + return False + + +def log_warnings(token: str, instance_name: str): + """Log warnings for missing authentication parameters.""" + if not token: + print("Warning: No API token provided. Using default configuration.", file=sys.stderr) + print("To specify a token, use: --token 'your-token'", file=sys.stderr) + + if not instance_name: + print("Warning: No instance name provided. Using default configuration.", file=sys.stderr) + print("To specify an instance, use: --instance-name 'your-instance'", file=sys.stderr) + + +def main(argv: Optional[Sequence[str]] = None): + start_time = time.time() + print("Starting DataPilot pre-commit hook...", file=sys.stderr) + + # Parse arguments + parser = setup_argument_parser() + args = parser.parse_known_args(argv) + + # Extract arguments + config_name, token, instance_name, backend_url, config_path, base_path, manifest_path, catalog_path = extract_arguments(args[0]) + + # Get configuration + config = get_configuration(config_name, token, instance_name, backend_url, config_path) + + # Log warnings for missing auth parameters + log_warnings(token, instance_name) + + # Determine file paths + manifest_path, catalog_path = get_file_paths(base_path, manifest_path, catalog_path) + + # Load manifest and catalog + manifest = load_manifest_file(manifest_path) + catalog = load_catalog_file(catalog_path) + + # Process changed files + selected_models = process_changed_files(args[1]) try: - print("Initializing DBT Insight Generator...", file=sys.stderr) - insight_generator = DBTInsightGenerator( - manifest=manifest, - catalog=catalog, - config=config, - selected_models=selected_models, - token=token, - instance_name=instance_name, - backend_url=backend_url, - ) - - print("Running insight generation...", file=sys.stderr) - reports = insight_generator.run() - - if reports: - print("Insights generated successfully. Analyzing results...", file=sys.stderr) - model_report = generate_model_insights_table(reports[MODEL]) - project_report = generate_project_insights_table(reports[PROJECT]) - - # Check if there are actual issues found - has_model_issues = len(model_report) > 0 - has_project_issues = len(project_report) > 0 - - if has_model_issues: - print("--" * 50, file=sys.stderr) - print("Model Insights", file=sys.stderr) - print("--" * 50, file=sys.stderr) - for model_id, report in model_report.items(): - print(f"Model: {model_id}", file=sys.stderr) - print(f"File path: {report['path']}", file=sys.stderr) - print(tabulate_data(report["table"], headers="keys"), file=sys.stderr) - print("\n", file=sys.stderr) - - if has_project_issues: - print("--" * 50, file=sys.stderr) - print("Project Insights", file=sys.stderr) - print("--" * 50, file=sys.stderr) - print(tabulate_data(project_report, headers="keys"), file=sys.stderr) - - # Only fail if there are actual issues - if has_model_issues or has_project_issues: - print("\nPre-commit hook failed: DataPilot found issues that need to be addressed.", file=sys.stderr) - sys.exit(1) - else: - print("All checks passed! No issues found.", file=sys.stderr) - else: - print("No insights generated. All checks passed!", file=sys.stderr) + # Run insight generation + reports = run_insight_generation(manifest, catalog, config, selected_models, token, instance_name, backend_url) + + # Process results + has_issues = process_reports(reports) + if has_issues: + sys.exit(1) except Exception as e: print(f"Error running DataPilot checks: {e}", file=sys.stderr) From 38fadc3ac6d62546baf472bd0415d41c46a6e385 Mon Sep 17 00:00:00 2001 From: Pulkit Gaur Date: Thu, 21 Aug 2025 13:04:13 +0530 Subject: [PATCH 10/10] Update argument extraction in executor hook to include an additional return value - Modified the `extract_arguments` function to return an extra string, enhancing the argument handling capabilities. - This change supports future extensions and improves flexibility in argument parsing. --- src/datapilot/core/platforms/dbt/hooks/executor_hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py index e1e687d4..712694e1 100644 --- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py +++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py @@ -60,7 +60,7 @@ def setup_argument_parser() -> argparse.ArgumentParser: return parser -def extract_arguments(args) -> Tuple[str, str, str, str, str, str, str]: +def extract_arguments(args) -> Tuple[str, str, str, str, str, str, str, str]: """Extract and return common arguments from parsed args.""" config_name = getattr(args, "config_name", None) token = getattr(args, "token", None)