zai-org · surajama · Mar 20, 2026
diff --git a/examples/detect/config.yaml b/examples/detect/config.yaml
@@ -0,0 +1,38 @@
+pipeline:
+  enable_layout: true
+  layout:
+    model_dir: PaddlePaddle/PP-DocLayoutV3_safetensors
+    label_task_mapping:
+      ocr:
+        - abstract
+        - aside_text
+        - content
+        - doc_title
+        - figure_title
+        - footnote
+        - header
+        - footer
+        - number
+        - paragraph_title
+        - reference
+        - reference_content
+        - text
+        - vision_footnote
+      detect:
+        - table        # bbox is returned, OCR is skipped, and content is set to ""
+      abandon:
+        - algorithm
+        - chart
+        - formula
+        - formula_number
+        - image
+        - seal
+  maas:
+    enabled: false
+  ocr_api:
+    api_host: 127.0.0.1
+    api_port: 1234
+    api_scheme: http
+    model: glm-ocr
+  page_loader:
+    max_tokens: 8192
diff --git a/examples/detect/example.png b/examples/detect/example.png
diff --git a/examples/detect/run.py b/examples/detect/run.py
@@ -0,0 +1,54 @@
+"""Detect example: report table bounding boxes without running table OCR.
+
+config.yaml (next to this script) assigns the ``table`` layout label to the
+``detect`` task and points the OCR API at http://127.0.0.1:1234.
+
+Usage:
+    python examples/detect/run.py
+
+examples/detect/example.png must exist before running; otherwise the script
+raises FileNotFoundError.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from glmocr.api import GlmOcr
+
+HERE = Path(__file__).resolve().parent
+PNG = HERE / "example.png"
+CONFIG = HERE / "config.yaml"
+
+
+def _is_detect_region(region: dict) -> bool:
+    """Return True for regions tagged ``detect`` or tables with empty content."""
+    if region.get("task_type") == "detect":
+        return True
+    return region.get("label") == "table" and region.get("content") == ""
+
+
+def main() -> None:
+    """Parse example.png, print its detect regions, then dump the full result."""
+    if not PNG.exists():
+        raise FileNotFoundError(f"Place your image at: {PNG}")
+
+    with GlmOcr(config_path=str(CONFIG), mode="selfhosted") as parser:
+        result = parser.parse(str(PNG))
+
+    # Only the first entry of json_result (presumably the first page) is inspected.
+    first_entry = result.json_result[0] if result.json_result else []
+    detect_regions = list(filter(_is_detect_region, first_entry))
+
+    print(f"\n--- {len(detect_regions)} detect region(s) ---")
+    for r in detect_regions:
+        bbox = r.get("bbox_2d")
+        print(f"  label={r.get('label')}  bbox_2d={bbox}  content={r.get('content')!r}")
+
+    print("\nFull json_result:")
+    print(json.dumps(result.to_dict(), indent=2, ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/glmocr/config.py b/glmocr/config.py
@@ -202,6 +202,7 @@ class LayoutConfig(_BaseConfig):
     layout_unclip_ratio: Optional[Any] = None
     layout_merge_bboxes_mode: Union[str, Dict[int, str]] = "large"
     label_task_mapping: Optional[Dict[str, Any]] = None
+    id2label: Optional[Dict[int, str]] = None
 
     @field_validator("device")
     @classmethod

diff --git a/glmocr/pipeline/pipeline.py b/glmocr/pipeline/pipeline.py
@@ -465,7 +465,7 @@ def vlm_recognition_thread() -> None:
                     except queue.Empty:
                         if processing_complete and len(futures) == 0:
                             for region, task_type, page_idx in pending_skip:
-                                region["content"] = None
+                                region["content"] = "" if task_type == "detect" else None
                                 with state.results_lock:
                                     state.recognition_results.append((page_idx, region))
                                 maybe_notify_ready_units(page_idx)
@@ -480,7 +480,7 @@ def vlm_recognition_thread() -> None:
                         continue
                     if item_type == "region":
                         cropped_image, region, task_type, page_idx = data
-                        if task_type == "skip":
+                        if task_type in ("skip", "detect"):
                             pending_skip.append((region, task_type, page_idx))
                         else:
                             req = self.page_loader.build_request_from_image(

diff --git a/glmocr/postprocess/result_formatter.py b/glmocr/postprocess/result_formatter.py
@@ -170,9 +170,9 @@ def process(self, grouped_results: List[List[Dict]]) -> Tuple[str, str]:
                         result["native_label"],
                     )
 
-                    # Skip empty content (after formatting)
+                    # Skip empty content (after formatting), but keep detect regions
                     content = result.get("content")
-                    if isinstance(content, str) and content.strip() == "":
+                    if isinstance(content, str) and content.strip() == "" and result.get("task_type") != "detect":
                         continue
 
                     # Update index