📍 Use GLM-OCR's <a href="https://docs.z.ai/guides/vlm/glm-ocr" target="_blank">API</a>
</p>
### Model Introduction
GLM-OCR is a multimodal OCR model for complex document understanding, built on the GLM-V encoder–decoder architecture. It introduces Multi-Token Prediction (MTP) loss and stable full-task reinforcement learning to improve training efficiency, recognition accuracy, and generalization. The model integrates the CogViT visual encoder pre-trained on large-scale image–text data, a lightweight cross-modal connector with efficient token downsampling, and a GLM-0.5B language decoder. Combined with a two-stage pipeline of layout analysis and parallel recognition based on PP-DocLayout-V3, GLM-OCR delivers robust and high-quality OCR performance across diverse document layouts.
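The two-stage pipeline described above (layout analysis followed by parallel recognition of the detected regions) can be sketched as follows. This is a minimal illustration only: the function names, the page/region data shapes, and the thread-based parallelism are all hypothetical stand-ins, not the actual `glmocr` API.

```python
from concurrent.futures import ThreadPoolExecutor

# Hypothetical stubs illustrating the two-stage flow; the real package
# exposes its own API and runs actual layout-analysis and OCR models.

def analyze_layout(page):
    """Stage 1: detect layout regions on a page (stubbed: a page is a
    dict that already carries its region list)."""
    return page["regions"]

def recognize_region(region):
    """Stage 2: recognize the text of one region (stubbed: return the
    text stored on the region)."""
    return region["text"]

def ocr_page(page):
    """Full pipeline: layout analysis, then recognize all regions in
    parallel, then join the results in reading order."""
    regions = analyze_layout(page)
    with ThreadPoolExecutor(max_workers=4) as pool:
        # pool.map preserves input order, so output follows region order.
        texts = pool.map(recognize_region, regions)
    return "\n".join(texts)

page = {"regions": [{"text": "Title"}, {"text": "Body paragraph"}]}
print(ocr_page(page))  # → "Title\nBody paragraph" on two lines
```

Because region recognition is independent per region, the second stage parallelizes naturally, which is what makes the layout-then-recognize split attractive for throughput.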
### News & Updates
- **[2026.3.12]** The GLM-OCR SDK now supports an agent-friendly Skill mode: just `pip install glmocr`, set an API key, and it is ready to use via CLI or Python, with no GPU or YAML config needed. See: [GLM-OCR Skill](glmocr_skill/SKILL.md)
- **[2026.3.12]** The GLM-OCR Technical Report is now available. See: [GLM-OCR Technical Report](https://arxiv.org/abs/2603.10910)
- **[2026.2.12]** A fine-tuning tutorial based on LLaMA-Factory is now available. See: [GLM-OCR Fine-tuning Guide](examples/finetune/README.md)