From 97ad3752acad9627eef10a3f1c5ca2ca4f62c638 Mon Sep 17 00:00:00 2001 From: NUNO MIGUEL DA SILVA SALVACAO Date: Mon, 16 Mar 2026 15:14:57 +0000 Subject: [PATCH 1/2] fix(cli): canonicalize legacy cli_crawler command surface Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli_crawler.py | 101 ++------------------------ src/crawler/cli_crawler.py | 69 +++++++++++++++++- tests/unit/test_cli_crawler_compat.py | 54 ++++++++++++++ 3 files changed, 124 insertions(+), 100 deletions(-) create mode 100644 tests/unit/test_cli_crawler_compat.py diff --git a/cli_crawler.py b/cli_crawler.py index 94b0e55..7189671 100644 --- a/cli_crawler.py +++ b/cli_crawler.py @@ -1,103 +1,12 @@ #!/usr/bin/env python3 -"""Universal CLI Help Crawler - OpenAPI for CLIs. +"""Legacy compatibility script for ``cli-crawler``. -Crawls CLI --help outputs and generates structured JSON maps -that AI agents can use for precise command reasoning. -""" - -from __future__ import annotations - -import argparse -import logging -import sys -from pathlib import Path - -from crawler.config import CLIConfig, CrawlerConfig, load_config -from crawler.pipeline import crawl_all, crawl_cli - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Crawl CLI --help outputs and generate structured JSON maps", - epilog="Examples:\n" - " python cli_crawler.py git -o output/git.json\n" - " python cli_crawler.py --config config.yaml --all\n" - " python cli_crawler.py docker -v --include-raw\n", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument("cli", nargs="?", help="CLI to crawl (e.g., git, docker)") - parser.add_argument("--config", "-c", type=Path, help="Path to config YAML") - parser.add_argument("--output", "-o", type=Path, help="Output file path") - parser.add_argument( - "--output-dir", - type=Path, - default=Path("./output"), - help="Output directory (default: ./output)", - ) - parser.add_argument("--all", action="store_true", help="Crawl all CLIs in config") - parser.add_argument( - "--include-raw", action="store_true", help="Include raw help text in main JSON" - ) - parser.add_argument("--verbose", "-v", action="store_true", help="Verbose logging") - parser.add_argument("--strict", action="store_true", help="Fail on first parse error") - parser.add_argument("--max-depth", type=int, help="Override max recursion depth") - parser.add_argument("--timeout", type=int, help="Override timeout per command (seconds)") - parser.add_argument("--list", action="store_true", help="List configured CLIs and exit") - - args = parser.parse_args() - - # Configure logging - logging.basicConfig( - level=logging.DEBUG if args.verbose else logging.INFO, - format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", - datefmt="%H:%M:%S", - ) +Prefer invoking the canonical command directly: - # Load config - config: CrawlerConfig - if args.config and args.config.exists(): - config = load_config(str(args.config)) - else: - config = CrawlerConfig() - - # List mode - if args.list: - if not config.clis: - print("No CLIs configured. Use --config to specify a config file.") - else: - print(f"Configured CLIs ({len(config.clis)}):") - for name, cfg in sorted(config.clis.items()): - group = f" [{cfg.group}]" if cfg.group else "" - env = f" (env: {cfg.environment})" if cfg.environment != "wsl" else "" - print(f" {name}{group}{env}") - return - - # Crawl all CLIs - if args.all: - if not config.clis: - print("No CLIs configured. Use --config to specify a config file.") - sys.exit(1) - crawl_all(config, args.output_dir, args.include_raw, args.strict) - return - - # Crawl single CLI - if args.cli: - cli_config = config.clis.get(args.cli, CLIConfig(name=args.cli)) - - # Apply CLI arg overrides - if args.max_depth is not None: - cli_config.max_depth = args.max_depth - if args.timeout is not None: - cli_config.timeout = args.timeout - - output = args.output or args.output_dir / f"{args.cli}.json" - crawl_cli(args.cli, cli_config, output, args.include_raw, args.strict) - return - - # No action specified - parser.print_help() - sys.exit(1) + cli-crawler [options] +""" +from crawler.cli_crawler import main if __name__ == "__main__": main() diff --git a/src/crawler/cli_crawler.py b/src/crawler/cli_crawler.py index 1c72e35..7309a1b 100644 --- a/src/crawler/cli_crawler.py +++ b/src/crawler/cli_crawler.py @@ -1,11 +1,74 @@ -"""Compatibility crawler entrypoint and basic help execution helpers (T013).""" +"""Compatibility wrappers for legacy ``cli_crawler`` entrypoints.""" from __future__ import annotations +import sys +from pathlib import Path + +from . import pipeline as _pipeline from .config import CLIConfig, CrawlerConfig from .executor import Executor from .models import CLIMap, ExecutionResult -from .pipeline import crawl_all, crawl_cli, main +from .pipeline import crawl_all, crawl_cli + +_RAW_FLAG = "--raw" +_LEGACY_RAW_FLAG = "--include-raw" + + +def _normalize_legacy_args(argv: list[str]) -> tuple[list[str], list[str]]: + """Normalize legacy flags to canonical ``cli-crawler`` arguments.""" + normalized: list[str] = [] + warnings: list[str] = [] + raw_enabled = False + legacy_raw_seen = False + + for arg in argv: + if arg == _RAW_FLAG: + raw_enabled = True + normalized.append(arg) + continue + if arg == _LEGACY_RAW_FLAG: + legacy_raw_seen = True + if not raw_enabled: + normalized.append(_RAW_FLAG) + raw_enabled = True + continue + normalized.append(arg) + + if legacy_raw_seen: + if _RAW_FLAG in argv: + warnings.append( + f"{_LEGACY_RAW_FLAG} is deprecated and ignored when {_RAW_FLAG} is also provided." + ) + else: + warnings.append(f"{_LEGACY_RAW_FLAG} is deprecated; treating it as {_RAW_FLAG}.") + + return normalized, warnings + + +def _legacy_entrypoint_warning(program_name: str) -> str | None: + """Return a warning when executed via legacy file-based entrypoint.""" + if Path(program_name).name == "cli_crawler.py": + return "Legacy entrypoint detected. Prefer the canonical command: cli-crawler." + return None + + +def main() -> None: + """Compatibility entrypoint that delegates to ``crawler.pipeline.main``.""" + original_argv = sys.argv[:] + normalized_args, warnings = _normalize_legacy_args(original_argv[1:]) + legacy_warning = _legacy_entrypoint_warning(original_argv[0]) + if legacy_warning: + warnings.insert(0, legacy_warning) + + for warning in warnings: + print(f"warning: {warning}", file=sys.stderr) + + sys.argv = [original_argv[0], *normalized_args] + try: + _pipeline.main() + finally: + sys.argv = original_argv def run_root_help(cli_name: str, config: CLIConfig | None = None) -> ExecutionResult: @@ -26,8 +89,6 @@ def crawl_single( def crawl_configured(config: CrawlerConfig, output_dir: str = "output") -> list[CLIMap]: """Compatibility helper for crawling all configured CLIs.""" - from pathlib import Path - return crawl_all(config, Path(output_dir)) diff --git a/tests/unit/test_cli_crawler_compat.py b/tests/unit/test_cli_crawler_compat.py new file mode 100644 index 0000000..e653455 --- /dev/null +++ b/tests/unit/test_cli_crawler_compat.py @@ -0,0 +1,54 @@ +"""Compatibility tests for legacy ``cli_crawler`` wrappers.""" + +from __future__ import annotations + +import sys + +import pytest + +from crawler import cli_crawler + + +def test_main_maps_include_raw_to_raw( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """Legacy ``--include-raw`` should map to canonical ``--raw``.""" + monkeypatch.setattr(sys, "argv", ["cli_crawler.py", "git", "--include-raw"]) + captured_argv: list[str] = [] + + def _fake_pipeline_main() -> None: + captured_argv.extend(sys.argv) + + monkeypatch.setattr(cli_crawler._pipeline, "main", _fake_pipeline_main) + + cli_crawler.main() + + assert captured_argv == ["cli_crawler.py", "git", "--raw"] + stderr = capsys.readouterr().err + assert "cli-crawler" in stderr + assert "--include-raw is deprecated; treating it as --raw." in stderr + + +def test_main_prefers_raw_when_both_raw_flags_are_present( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """When both flags are present, keep one canonical ``--raw`` only.""" + monkeypatch.setattr( + sys, + "argv", + ["cli_crawler.py", "git", "--raw", "--include-raw"], + ) + captured_argv: list[str] = [] + + def _fake_pipeline_main() -> None: + captured_argv.extend(sys.argv) + + monkeypatch.setattr(cli_crawler._pipeline, "main", _fake_pipeline_main) + + cli_crawler.main() + + assert captured_argv == ["cli_crawler.py", "git", "--raw"] + stderr = capsys.readouterr().err + assert "--include-raw is deprecated and ignored when --raw is also provided." in stderr From 0192f75c4cfa1fe66da2d88f17f7ca38aa26fdbc Mon Sep 17 00:00:00 2001 From: NUNO MIGUEL DA SILVA SALVACAO Date: Mon, 16 Mar 2026 15:41:54 +0000 Subject: [PATCH 2/2] Update src/crawler/cli_crawler.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/crawler/cli_crawler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/crawler/cli_crawler.py b/src/crawler/cli_crawler.py index 7309a1b..56fc458 100644 --- a/src/crawler/cli_crawler.py +++ b/src/crawler/cli_crawler.py @@ -24,16 +24,16 @@ def _normalize_legacy_args(argv: list[str]) -> tuple[list[str], list[str]]: for arg in argv: if arg == _RAW_FLAG: + if not raw_enabled: + normalized.append(arg) raw_enabled = True - normalized.append(arg) - continue - if arg == _LEGACY_RAW_FLAG: + elif arg == _LEGACY_RAW_FLAG: legacy_raw_seen = True if not raw_enabled: normalized.append(_RAW_FLAG) raw_enabled = True - continue - normalized.append(arg) + else: + normalized.append(arg) if legacy_raw_seen: if _RAW_FLAG in argv: