diff --git a/examples/detect/config.yaml b/examples/detect/config.yaml new file mode 100644 index 0000000..252142c --- /dev/null +++ b/examples/detect/config.yaml @@ -0,0 +1,38 @@ +pipeline: + enable_layout: true + layout: + model_dir: PaddlePaddle/PP-DocLayoutV3_safetensors + label_task_mapping: + ocr: + - abstract + - aside_text + - content + - doc_title + - figure_title + - footnote + - header + - footer + - number + - paragraph_title + - reference + - reference_content + - text + - vision_footnote + detect: + - table # bbox is returned, OCR is skipped, content is "" + abandon: + - algorithm + - chart + - formula + - formula_number + - image + - seal + maas: + enabled: false + ocr_api: + api_host: 127.0.0.1 + api_port: 1234 + api_scheme: http + model: glm-ocr + page_loader: + max_tokens: 8192 diff --git a/examples/detect/example.png b/examples/detect/example.png new file mode 100644 index 0000000..c55e09f Binary files /dev/null and b/examples/detect/example.png differ diff --git a/examples/detect/run.py b/examples/detect/run.py new file mode 100644 index 0000000..8146ea7 --- /dev/null +++ b/examples/detect/run.py @@ -0,0 +1,40 @@ +"""Detect example — prints table bounding boxes without running OCR on the tables. + +Run: + python examples/detect/run.py + +Reads examples/detect/example.png; replace that file to try your own image. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +from glmocr.api import GlmOcr + +HERE = Path(__file__).resolve().parent +PNG = HERE / "example.png" +CONFIG = HERE / "config.yaml" + + +def main() -> None: + if not PNG.exists(): + raise FileNotFoundError(f"Place your image at: {PNG}") + + with GlmOcr(config_path=str(CONFIG), mode="selfhosted") as parser: + result = parser.parse(str(PNG)) + + regions = result.json_result[0] if result.json_result else [] + detect_regions = [r for r in regions if r.get("task_type") == "detect" or (r.get("label") == "table" and r.get("content") == "")] + print(f"\n--- {len(detect_regions)} detect region(s) ---") + for r in detect_regions: + bbox = r.get("bbox_2d") + print(f" label={r.get('label')} bbox_2d={bbox} content={r.get('content')!r}") + + print("\nFull json_result:") + print(json.dumps(result.to_dict(), indent=2, ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/glmocr/config.py b/glmocr/config.py index 38fa92c..242ad44 100644 --- a/glmocr/config.py +++ b/glmocr/config.py @@ -202,6 +202,7 @@ class LayoutConfig(_BaseConfig): layout_unclip_ratio: Optional[Any] = None layout_merge_bboxes_mode: Union[str, Dict[int, str]] = "large" label_task_mapping: Optional[Dict[str, Any]] = None + id2label: Optional[Dict[int, str]] = None @field_validator("device") @classmethod diff --git a/glmocr/pipeline/pipeline.py b/glmocr/pipeline/pipeline.py index 23b91a0..ee255a3 100644 --- a/glmocr/pipeline/pipeline.py +++ b/glmocr/pipeline/pipeline.py @@ -465,7 +465,7 @@ def vlm_recognition_thread() -> None: except queue.Empty: if processing_complete and len(futures) == 0: for region, task_type, page_idx in pending_skip: - region["content"] = None + region["content"] = "" if task_type == "detect" else None with state.results_lock: state.recognition_results.append((page_idx, region)) maybe_notify_ready_units(page_idx) @@ -480,7 +480,7 @@ def vlm_recognition_thread() -> None: continue if 
item_type == "region": cropped_image, region, task_type, page_idx = data - if task_type == "skip": + if task_type in ("skip", "detect"): pending_skip.append((region, task_type, page_idx)) else: req = self.page_loader.build_request_from_image( diff --git a/glmocr/postprocess/result_formatter.py b/glmocr/postprocess/result_formatter.py index 46ff2d7..78b168a 100644 --- a/glmocr/postprocess/result_formatter.py +++ b/glmocr/postprocess/result_formatter.py @@ -170,9 +170,9 @@ def process(self, grouped_results: List[List[Dict]]) -> Tuple[str, str]: result["native_label"], ) - # Skip empty content (after formatting) + # Skip regions whose content is empty after formatting, but keep detect regions (their content is intentionally "") content = result.get("content") - if isinstance(content, str) and content.strip() == "": + if isinstance(content, str) and content.strip() == "" and result.get("task_type") != "detect": continue # Update index