Skip to content

Commit c2a28d6

Browse files
authored
Cli (#127)
* set maas default, add env and mode argument in CLI, update SKILL Signed-off-by: JaredforReal <w13431838023@gmail.com> * update readme Signed-off-by: JaredforReal <w13431838023@gmail.com> * release v0.1.2 Signed-off-by: JaredforReal <w13431838023@gmail.com> * accept some reviews Signed-off-by: JaredforReal <w13431838023@gmail.com> --------- Signed-off-by: JaredforReal <w13431838023@gmail.com>
1 parent d25bf38 commit c2a28d6

File tree

11 files changed

+137
-39
lines changed

11 files changed

+137
-39
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
📍 Use GLM-OCR's <a href="https://docs.z.ai/guides/vlm/glm-ocr" target="_blank">API</a>
1414
</p>
1515

16-
1716
### Model Introduction
1817

1918
GLM-OCR is a multimodal OCR model for complex document understanding, built on the GLM-V encoder–decoder architecture. It introduces Multi-Token Prediction (MTP) loss and stable full-task reinforcement learning to improve training efficiency, recognition accuracy, and generalization. The model integrates the CogViT visual encoder pre-trained on large-scale image–text data, a lightweight cross-modal connector with efficient token downsampling, and a GLM-0.5B language decoder. Combined with a two-stage pipeline of layout analysis and parallel recognition based on PP-DocLayout-V3, GLM-OCR delivers robust and high-quality OCR performance across diverse document layouts.
@@ -30,6 +29,7 @@ GLM-OCR is a multimodal OCR model for complex document understanding, built on t
3029

3130
### News & Updates
3231

32+
- **[2026.3.12]** GLM-OCR SDK now supports agent-friendly Skill mode — just `pip install glmocr` + set API key, ready to use via CLI or Python with no GPU or YAML config needed. See: [GLM-OCR Skill](glmocr_skill/SKILL.md)
3333
- **[2026.3.12]** GLM-OCR Technical Report is now available. See: [GLM-OCR Technical Report](https://arxiv.org/abs/2603.10910)
3434
- **[2026.2.12]** Fine-tuning tutorial based on LLaMA-Factory is now available. See: [GLM-OCR Fine-tuning Guide](examples/finetune/README.md)
3535

README_zh.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ GLM-OCR 是一款面向复杂文档理解的多模态 OCR 模型,基于 GLM-V
2929

3030
### 最新动态
3131

32+
- **[2026.3.12]** GLM-OCR SDK 新增 Agent Skill 模式 — `pip install glmocr` + 配置 API Key,即可通过 CLI 或 Python 直接使用,无需 GPU 和 YAML 配置。详情见:[GLM-OCR Skill](glmocr_skill/SKILL.md)
3233
- **[2026.3.12]** GLM-OCR 技术报告已上线,详情见:[GLM-OCR 技术报告](https://arxiv.org/abs/2603.10910)
3334
- **[2026.2.12]** 基于 LLaMA-Factory 的微调教程上线,详情见: [GLM-OCR 微调教程](examples/finetune/README_zh.md)
3435

glmocr/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"GlmOcrConfig",
1919
"load_config",
2020
"MaaSClient",
21+
"MissingApiKeyError",
2122
"GlmOcr",
2223
"parse",
2324
]
@@ -30,6 +31,7 @@
3031
"GlmOcrConfig": ("config", "GlmOcrConfig"),
3132
"load_config": ("config", "load_config"),
3233
"MaaSClient": ("maas_client", "MaaSClient"),
34+
"MissingApiKeyError": ("maas_client", "MissingApiKeyError"),
3335
"GlmOcr": ("api", "GlmOcr"),
3436
"parse": ("api", "parse"),
3537
}
@@ -56,6 +58,6 @@ def __dir__():
5658
from . import dataloader, layout, postprocess, utils
5759
from .api import GlmOcr, parse
5860
from .config import GlmOcrConfig, load_config
59-
from .maas_client import MaaSClient
61+
from .maas_client import MaaSClient, MissingApiKeyError
6062
from .parser_result import PipelineResult
6163
from .pipeline import Pipeline

glmocr/api.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
print(results[0].to_dict())
1919
"""
2020

21+
import os
2122
import re
2223
from typing import Any, Dict, Generator, List, Literal, Optional, Union, overload
2324
from pathlib import Path
@@ -71,6 +72,7 @@ def __init__(
7172
timeout: Optional[int] = None,
7273
enable_layout: Optional[bool] = None,
7374
log_level: Optional[str] = None,
75+
env_file: Optional[str] = None,
7476
# Extra knobs for self-hosted mode & GPU binding
7577
ocr_api_host: Optional[str] = None,
7678
ocr_api_port: Optional[int] = None,
@@ -92,9 +94,13 @@ def __init__(
9294
timeout: Request timeout in seconds.
9395
enable_layout: Whether to run layout detection.
9496
log_level: Logging level (DEBUG, INFO, WARNING, ERROR).
97+
env_file: Path to a ``.env`` file to load API key and other settings from.
9598
"""
96-
# If user provides api_key but no explicit mode, default to MaaS.
97-
if api_key is not None and mode is None:
99+
# If an API key is available (constructor arg or env var), default to MaaS.
100+
# This ensures `GlmOcr()` with GLMOCR_API_KEY in env auto-selects MaaS
101+
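# even when the user has an old YAML with maas.enabled=false.
# NOTE(review): only the constructor arg and os.environ are checked here; a key
# that exists solely in ``env_file`` does not trigger this auto-default.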
102+
_has_api_key = api_key is not None or bool(os.environ.get("GLMOCR_API_KEY"))
103+
if _has_api_key and mode is None:
98104
mode = "maas"
99105

100106
# Build config: overrides > env vars > YAML > defaults
@@ -107,6 +113,7 @@ def __init__(
107113
timeout=timeout,
108114
enable_layout=enable_layout,
109115
log_level=log_level,
116+
env_file=env_file,
110117
ocr_api_host=ocr_api_host,
111118
ocr_api_port=ocr_api_port,
112119
cuda_visible_devices=cuda_visible_devices,
@@ -592,6 +599,7 @@ def parse(
592599
timeout: Optional[int] = None,
593600
enable_layout: Optional[bool] = None,
594601
log_level: Optional[str] = None,
602+
env_file: Optional[str] = None,
595603
**kwargs: Any,
596604
) -> Union[PipelineResult, List[PipelineResult], Generator[PipelineResult, None, None]]:
597605
"""Convenience function: parse images or documents in one call.
@@ -657,6 +665,7 @@ def parse(
657665
timeout=timeout,
658666
enable_layout=enable_layout,
659667
log_level=log_level,
668+
env_file=env_file,
660669
) as parser:
661670
return parser.parse(
662671
images,

glmocr/cli.py

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import List
1212

1313
from glmocr.api import GlmOcr
14+
from glmocr.maas_client import MissingApiKeyError
1415
from glmocr.utils.logging import get_logger, configure_logging
1516

1617
logger = get_logger(__name__)
@@ -60,19 +61,28 @@ def main():
6061
formatter_class=argparse.RawDescriptionHelpFormatter,
6162
epilog="""
6263
Examples:
63-
# Parse a single image file
64+
# Parse a single image (uses GLMOCR_API_KEY from environment)
6465
glmocr parse image.png
6566
67+
# Pass API key directly (no env setup needed)
68+
glmocr parse image.png --api-key sk-xxx
69+
6670
# Parse all images in a directory
67-
glmocr parse ./images/
71+
glmocr parse ./images/ --api-key sk-xxx
6872
69-
# Disable layout detection (OCR-only): set pipeline.enable_layout=false
70-
glmocr parse image.png --config my_config.yaml
73+
# Use self-hosted vLLM/SGLang instead of cloud API
74+
glmocr parse image.png --mode selfhosted
7175
7276
# Specify output directory
7377
glmocr parse image.png --output ./output/
7478
75-
# Specify config file
79+
# Print results to stdout only (no files written)
80+
glmocr parse image.png --api-key sk-xxx --stdout --no-save
81+
82+
# Load API key from a specific .env file
83+
glmocr parse image.png --env-file /path/to/.env
84+
85+
# Specify custom config file
7686
glmocr parse image.png --config config.yaml
7787
""",
7888
)
@@ -118,6 +128,26 @@ def main():
118128
action="store_true",
119129
help="Output results to standard output (JSON format)",
120130
)
131+
parse_parser.add_argument(
132+
"--api-key",
133+
"-k",
134+
type=str,
135+
default=None,
136+
help="API key for MaaS mode (overrides GLMOCR_API_KEY env var)",
137+
)
138+
parse_parser.add_argument(
139+
"--mode",
140+
type=str,
141+
default=None,
142+
choices=["maas", "selfhosted"],
143+
help="Operation mode: 'maas' (cloud API, default) or 'selfhosted' (local vLLM/SGLang)",
144+
)
145+
parse_parser.add_argument(
146+
"--env-file",
147+
type=str,
148+
default=None,
149+
help="Path to .env file to load GLMOCR_API_KEY and other settings from",
150+
)
121151
parse_parser.add_argument(
122152
"--log-level",
123153
type=str,
@@ -144,7 +174,12 @@ def main():
144174
# Use GlmOcr API
145175
save_layout_vis = not args.no_layout_vis
146176

147-
with GlmOcr(config_path=args.config) as glm_parser:
177+
with GlmOcr(
178+
config_path=args.config,
179+
api_key=args.api_key,
180+
mode=args.mode,
181+
env_file=args.env_file,
182+
) as glm_parser:
148183
logger.info(
149184
"Using Pipeline (enable_layout=%s)...",
150185
"true" if glm_parser.enable_layout else "false",
@@ -200,6 +235,17 @@ def main():
200235
except KeyboardInterrupt:
201236
logger.info("Interrupted by user")
202237
sys.exit(1)
238+
except MissingApiKeyError as e:
239+
logger.error(
240+
"%s\n\n"
241+
" Quick fix:\n"
242+
" export GLMOCR_API_KEY=sk-xxx # set once in shell\n"
243+
" glmocr parse image.png --api-key sk-xxx # or pass directly\n\n"
244+
" Get your free key at: https://open.bigmodel.cn",
245+
e,
246+
)
247+
logger.debug(traceback.format_exc())
248+
sys.exit(1)
203249
except Exception as e:
204250
logger.error("Error: %s", e)
205251
logger.debug(traceback.format_exc())

glmocr/config.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,8 @@ class MaaSApiConfig(_BaseConfig):
113113
"""
114114

115115
# Enable MaaS mode (passthrough to Zhipu cloud API)
116-
enabled: bool = False
116+
# Default: True — MaaS is the default mode after `pip install glmocr` (no GPU needed)
117+
enabled: bool = True
117118

118119
# API endpoint (default: Zhipu GLM-OCR layout_parsing API)
119120
api_url: str = "https://open.bigmodel.cn/api/paas/v4/layout_parsing"
@@ -235,14 +236,26 @@ def _coerce_env_value(dotted_path: str, raw: str) -> Any:
235236
return raw
236237

237238

238-
def _collect_env_overrides() -> Dict[str, Any]:
239+
def _collect_env_overrides(
240+
env_file: Optional[Union[str, Path]] = None,
241+
) -> Dict[str, Any]:
239242
"""Read GLMOCR_* values from ``.env`` file + real environment variables.
240243
244+
Args:
245+
env_file: Explicit path to a ``.env`` file. When provided, this file
246+
is used instead of the auto-discovered one. Raises
247+
``FileNotFoundError`` if the path does not exist or is not a regular file.
248+
241249
Priority: real ``os.environ`` > ``.env`` file. This means a user can
242250
always override a ``.env`` value by exporting the variable in the shell.
243251
"""
244252
# 1. Load .env file (does NOT mutate os.environ)
245-
dotenv_path = _find_dotenv()
253+
if env_file is not None:
254+
dotenv_path = Path(env_file)
255+
if not dotenv_path.is_file():
256+
raise FileNotFoundError(f".env file not found: {dotenv_path}")
257+
else:
258+
dotenv_path = _find_dotenv()
246259
dotenv_vars: Dict[str, Optional[str]] = (
247260
dotenv_values(dotenv_path) if dotenv_path else {}
248261
)
@@ -319,6 +332,7 @@ def from_env(
319332
* ``timeout`` – request timeout in seconds
320333
* ``enable_layout`` – whether to run layout detection
321334
* ``log_level`` – logging level (DEBUG / INFO / …)
335+
* ``env_file`` – explicit path to a ``.env`` file
322336
323337
Any other keyword is silently ignored so that callers can safely
324338
forward ``**kwargs`` without worrying about typos crashing the SDK.
@@ -349,7 +363,8 @@ def from_env(
349363
data = {}
350364

351365
# 2. Environment variable overrides
352-
env_data = _collect_env_overrides()
366+
env_file = overrides.pop("env_file", None)
367+
env_data = _collect_env_overrides(env_file=env_file)
353368
if env_data:
354369
_deep_merge(data, env_data)
355370

glmocr/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ pipeline:
3232
#
3333
# Note: When maas.enabled=true, the ocr_api and layout settings below are ignored.
3434
maas:
35-
enabled: false # Set to true to use MaaS mode
35+
enabled: true # MaaS mode is on by default; set to false to use self-hosted mode
3636
api_url: https://open.bigmodel.cn/api/paas/v4/layout_parsing
3737
model: glm-ocr
3838
api_key: null # Required! Get from https://open.bigmodel.cn

glmocr/maas_client.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,19 @@
3434
profiler = get_profiler(__name__)
3535

3636

37+
class MissingApiKeyError(ValueError):
38+
"""Raised when MaaS mode is active but no API key is configured."""
39+
40+
def __init__(self) -> None:
41+
super().__init__(
42+
"MaaS mode requires an API key.\n"
43+
" Option 1 (env var): export GLMOCR_API_KEY=sk-xxx\n"
44+
" Option 2 (CLI flag): glmocr parse image.png --api-key sk-xxx\n"
45+
' Option 3 (Python): GlmOcr(api_key="sk-xxx")\n'
46+
" Get your key at: https://open.bigmodel.cn"
47+
)
48+
49+
3750
# Default MaaS API endpoint
3851
DEFAULT_MAAS_URL = "https://open.bigmodel.cn/api/paas/v4/layout_parsing"
3952
DEFAULT_MAAS_MODEL = "glm-ocr"
@@ -103,10 +116,7 @@ def __init__(self, config: "MaaSApiConfig"):
103116
# Authentication
104117
self.api_key = config.api_key or os.getenv("GLMOCR_API_KEY")
105118
if not self.api_key:
106-
raise ValueError(
107-
"API key is required for MaaS mode. "
108-
"Set it in config.yaml or GLMOCR_API_KEY environment variable."
109-
)
119+
raise MissingApiKeyError()
110120

111121
# SSL verification
112122
self.verify_ssl = config.verify_ssl

glmocr/tests/test_unit.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -341,20 +341,19 @@ def test_maas_config_defaults(self):
341341
from glmocr.config import MaaSApiConfig
342342

343343
config = MaaSApiConfig()
344-
assert config.enabled is False
344+
assert config.enabled is True
345345
assert config.api_url == "https://open.bigmodel.cn/api/paas/v4/layout_parsing"
346346
assert config.model == "glm-ocr"
347347
assert config.verify_ssl is True
348348

349349
def test_maas_client_requires_api_key(self):
350-
"""MaaSClient raises error when API key is missing."""
351-
from glmocr.maas_client import MaaSClient
350+
"""MaaSClient raises MissingApiKeyError when API key is missing."""
351+
from glmocr.maas_client import MaaSClient, MissingApiKeyError
352352
from glmocr.config import MaaSApiConfig
353353

354354
config = MaaSApiConfig(api_key=None)
355-
with pytest.raises(ValueError) as exc:
355+
with pytest.raises(MissingApiKeyError):
356356
MaaSClient(config)
357-
assert "API key is required" in str(exc.value)
358357

359358
def test_maas_client_init_with_api_key(self):
360359
"""MaaSClient initializes correctly with API key."""
@@ -506,7 +505,7 @@ def test_config_maas_in_pipeline(self):
506505

507506
config = PipelineConfig()
508507
assert hasattr(config, "maas")
509-
assert config.maas.enabled is False
508+
assert config.maas.enabled is True
510509

511510

512511
# ═══════════════════════════════════════════════════════════════════════
@@ -707,7 +706,7 @@ def test_defaults_when_nothing_set(self, monkeypatch):
707706
monkeypatch.setattr("glmocr.config._find_dotenv", lambda: None)
708707

709708
cfg = GlmOcrConfig.from_env()
710-
assert cfg.pipeline.maas.enabled is False
709+
assert cfg.pipeline.maas.enabled is True
711710
assert cfg.logging.level == "INFO"
712711

713712
def test_overrides_win_over_env(self, monkeypatch):

0 commit comments

Comments
 (0)