Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions examples/detect/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
pipeline:
enable_layout: true
layout:
model_dir: PaddlePaddle/PP-DocLayoutV3_safetensors
label_task_mapping:
ocr:
- abstract
- aside_text
- content
- doc_title
- figure_title
- footnote
- header
- footer
- number
- paragraph_title
- reference
- reference_content
- text
- vision_footnote
detect:
- table # bbox returned, OCR skipped, content: ""
abandon:
- algorithm
- chart
- formula
- formula_number
- image
- seal
maas:
enabled: false
ocr_api:
api_host: 127.0.0.1
api_port: 1234
api_scheme: http
model: glm-ocr
page_loader:
max_tokens: 8192
Binary file added examples/detect/example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
40 changes: 40 additions & 0 deletions examples/detect/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Detect example — returns table bounding boxes without running table OCR.

Run:
python examples/detect/run.py

Drop example.png into examples/detect/ before running.
"""

from __future__ import annotations

import json
from pathlib import Path

from glmocr.api import GlmOcr

# Directory containing this script; the example's inputs are resolved relative to it.
HERE = Path(__file__).resolve().parent
# Image that main() checks for and passes to the parser.
PNG = HERE / "example.png"
# Configuration file passed to GlmOcr via config_path.
CONFIG = HERE / "config.yaml"


def main() -> None:
    """Parse the example image and report the detect-only regions.

    The image at ``PNG`` is parsed with a ``GlmOcr`` instance configured from
    ``CONFIG`` in ``"selfhosted"`` mode. Only the first entry of
    ``result.json_result`` is inspected. A region is reported when its
    ``task_type`` is ``"detect"``, or when it is labelled ``"table"`` and its
    content is the empty string. The full result is then dumped as JSON.

    Raises:
        FileNotFoundError: If the example image does not exist.
    """
    if not PNG.exists():
        raise FileNotFoundError(f"Place your image at: {PNG}")

    with GlmOcr(config_path=str(CONFIG), mode="selfhosted") as parser:
        result = parser.parse(str(PNG))

    # An empty or missing json_result yields no regions to inspect.
    first_entry = result.json_result[0] if result.json_result else []

    detect_regions = []
    for region in first_entry:
        tagged_as_detect = region.get("task_type") == "detect"
        # Fallback for regions that carry no task_type: a table with empty content.
        is_empty_table = (
            region.get("label") == "table" and region.get("content") == ""
        )
        if tagged_as_detect or is_empty_table:
            detect_regions.append(region)

    print(f"\n--- {len(detect_regions)} detect region(s) ---")
    for region in detect_regions:
        label = region.get("label")
        bbox = region.get("bbox_2d")
        content = region.get("content")
        print(f" label={label} bbox_2d={bbox} content={content!r}")

    print("\nFull json_result:")
    serialized = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
    print(serialized)


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions glmocr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ class LayoutConfig(_BaseConfig):
layout_unclip_ratio: Optional[Any] = None
layout_merge_bboxes_mode: Union[str, Dict[int, str]] = "large"
label_task_mapping: Optional[Dict[str, Any]] = None
id2label: Optional[Dict[int, str]] = None

@field_validator("device")
@classmethod
Expand Down
4 changes: 2 additions & 2 deletions glmocr/pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def vlm_recognition_thread() -> None:
except queue.Empty:
if processing_complete and len(futures) == 0:
for region, task_type, page_idx in pending_skip:
region["content"] = None
region["content"] = "" if task_type == "detect" else None
with state.results_lock:
state.recognition_results.append((page_idx, region))
maybe_notify_ready_units(page_idx)
Expand All @@ -480,7 +480,7 @@ def vlm_recognition_thread() -> None:
continue
if item_type == "region":
cropped_image, region, task_type, page_idx = data
if task_type == "skip":
if task_type in ("skip", "detect"):
pending_skip.append((region, task_type, page_idx))
else:
req = self.page_loader.build_request_from_image(
Expand Down
4 changes: 2 additions & 2 deletions glmocr/postprocess/result_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,9 @@ def process(self, grouped_results: List[List[Dict]]) -> Tuple[str, str]:
result["native_label"],
)

# Skip empty content (after formatting)
# Skip empty content (after formatting), but keep detect regions
content = result.get("content")
if isinstance(content, str) and content.strip() == "":
if isinstance(content, str) and content.strip() == "" and result.get("task_type") != "detect":
continue

# Update index
Expand Down