From 686e9d2c5f290912fc2b1b6749631d70f03328bb Mon Sep 17 00:00:00 2001 From: VooDisss Date: Mon, 30 Mar 2026 23:33:34 +0300 Subject: [PATCH 1/7] Add printed page number metadata to SDK JSON output Route PP-DocLayoutV3 'number' regions through OCR instead of dropping them, then extract printed page number evidence from recognized number blocks in the result formatter. Preserve the feature as number-only, support both numeric and Roman labels, and derive three explicit output layers: page_number_candidates, document_page_numbering, and page_metadata. Keep the general json_result contract lean by stripping transient layout_index and layout_score fields from final blocks while retaining native_label, and wrap saved paper.json only when real printed-page data exists. Also expose detect_printed_page_numbers through config, constructor, and environment overrides, align MaaS output with self-hosted behavior, add contract-focused tests for legacy-vs-wrapped save behavior and lean json_result output, and document the exact save contract in the English and Chinese READMEs. --- README.md | 60 ++++ README_zh.md | 59 ++++ glmocr/api.py | 25 ++ glmocr/config.py | 6 + glmocr/config.yaml | 3 +- glmocr/parser_result/base.py | 28 ++ glmocr/parser_result/pipeline_result.py | 9 + glmocr/pipeline/pipeline.py | 6 + glmocr/postprocess/result_formatter.py | 208 ++++++++++++- glmocr/tests/test_unit.py | 385 ++++++++++++++++++++++++ 10 files changed, 787 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7635611..2ea7807 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,11 @@ with GlmOcr() as parser: print(result.json_result) result.save() +# Extract printed page numbers from PP-DocLayoutV3 `number` regions +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") + print(result.to_dict().get("page_metadata", [])) + # Place layout model on CPU (useful when GPU is reserved for OCR) with GlmOcr(layout_device="cpu") as parser: result = parser.parse("image.png") @@ -302,6 +307,7 @@ pipeline: # Result formatting result_formatter: output_format: both # json, markdown, or both + detect_printed_page_numbers: false # Layout model device placement layout: @@ -310,6 +316,23 @@ pipeline: See [config.yaml](glmocr/config.yaml) for all options. +Printed page number detection can be enabled in three ways: + +```python +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") +``` + +```powershell +$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true' +``` + +```yaml +pipeline: + result_formatter: + detect_printed_page_numbers: true +``` + ### Output Formats Here are two examples of output formats: @@ -320,6 +343,43 @@ Here are two examples of output formats: [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]] ``` +When printed page detection is enabled and printed-page data is actually found, +saved `paper.json` is wrapped as a top-level object and includes: + +```json +{ + "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]], + "page_number_candidates": [ + { + "page_index": 1, + "label": "number", + "content": "22", + "layout_index": 0, + "bbox_2d": [92, 26, 120, 41], + "layout_score": 0.77, + "numeric_like": true, + "roman_like": false + } + ], + "document_page_numbering": { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 21, + "candidate_pages": 4 + }, + "page_metadata": [ + { + "page_index": 1, + "printed_page_label": "22", + "printed_page_block_index": 0, + "printed_page_bbox_2d": [92, 26, 120, 41], + "printed_page_confidence": 0.77 + } + ] +} +``` + - Markdown ```markdown diff --git a/README_zh.md b/README_zh.md index 8a2f3fe..71504f0 100644 --- a/README_zh.md +++ b/README_zh.md @@ -214,6 +214,11 @@ with GlmOcr() as parser: result = parser.parse("image.png") print(result.json_result) result.save() + +# 从 PP-DocLayoutV3 的 `number` 区域提取印刷页码 +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") + print(result.to_dict().get("page_metadata", [])) ``` #### Flask 服务 @@ -287,10 +292,28 @@ pipeline: # Result formatting result_formatter: output_format: both # json, markdown, or both + detect_printed_page_numbers: false ``` 更多选项请参考 [config.yaml](glmocr/config.yaml)。 +印刷页码检测支持以下三种启用方式: + +```python +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") +``` + +```powershell +$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true' +``` + +```yaml +pipeline: + result_formatter: + detect_printed_page_numbers: true +``` + ### 输出格式 这里给出两种输出格式示例: @@ -301,6 +324,42 @@ pipeline: [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]] ``` +启用印刷页码检测且实际检测到印刷页码数据时,保存的 `paper.json` 会变成顶层对象,并包含: + +```json +{ + "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]], + "page_number_candidates": [ + { + "page_index": 1, + "label": "number", + "content": "22", + "layout_index": 0, + "bbox_2d": [92, 26, 120, 41], + "layout_score": 0.77, + "numeric_like": true, + "roman_like": false + } + ], + "document_page_numbering": { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 21, + "candidate_pages": 4 + }, + "page_metadata": [ + { + "page_index": 1, + "printed_page_label": "22", + "printed_page_block_index": 0, + "printed_page_bbox_2d": [92, 26, 120, 41], + "printed_page_confidence": 0.77 + } + ] +} +``` + - Markdown ```markdown diff --git a/glmocr/api.py b/glmocr/api.py index dcf5570..ffb218a 100644 --- a/glmocr/api.py +++ b/glmocr/api.py @@ -84,6 +84,7 @@ def __init__( ocr_api_port: Optional[int] = None, cuda_visible_devices: Optional[str] = None, layout_device: Optional[str] = None, + detect_printed_page_numbers: Optional[bool] = None, **kwargs: Any, ): """Initialize GlmOcr. @@ -130,6 +131,7 @@ def __init__( ocr_api_port=ocr_api_port, cuda_visible_devices=cuda_visible_devices, layout_device=layout_device, + detect_printed_page_numbers=detect_printed_page_numbers, **kwargs, ) # Apply logging config for API/SDK usage. @@ -441,8 +443,11 @@ def _maas_response_to_pipeline_result( { "index": region.get("index", 0), "label": region.get("label", "text"), + "native_label": region.get("label", "text"), "content": region.get("content", ""), "bbox_2d": bbox, + "layout_index": region.get("index", 0), + "layout_score": float(region.get("score") or 0.0), } ) json_result.append(page_result) @@ -460,12 +465,32 @@ def _maas_response_to_pipeline_result( source, ) + page_metadata = None + page_number_candidates = None + document_page_numbering = None + if self.config_model.pipeline.result_formatter.detect_printed_page_numbers: + from glmocr.postprocess import ResultFormatter + + formatter = ResultFormatter(self.config_model.pipeline.result_formatter) + ( + page_number_candidates, + document_page_numbering, + page_metadata, + ) = formatter.extract_printed_page_data(json_result) + + from glmocr.postprocess import ResultFormatter + + ResultFormatter._strip_layout_metadata(json_result) + # Create PipelineResult result = PipelineResult( json_result=json_result, markdown_result=markdown_result, original_images=[source], image_files=image_files or None, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) # Store additional MaaS response data diff --git a/glmocr/config.py b/glmocr/config.py index 789e206..e348db5 100644 --- a/glmocr/config.py +++ b/glmocr/config.py @@ -48,6 +48,8 @@ def _find_dotenv(start: Optional[Path] = None) -> Optional[Path]: "LAYOUT_CUDA_VISIBLE_DEVICES": "pipeline.layout.cuda_visible_devices", # Explicit device for layout model: "cpu", "cuda", "cuda:0", etc. "LAYOUT_DEVICE": "pipeline.layout.device", + # Result formatter + "DETECT_PRINTED_PAGE_NUMBERS": "pipeline.result_formatter.detect_printed_page_numbers", # Logging "LOG_LEVEL": "logging.level", } @@ -175,6 +177,7 @@ class ResultFormatterConfig(_BaseConfig): enable_merge_formula_numbers: bool = True enable_merge_text_blocks: bool = True enable_format_bullet_points: bool = True + detect_printed_page_numbers: bool = False label_visualization_mapping: Dict[str, Any] = Field(default_factory=dict) @@ -260,6 +263,8 @@ def _coerce_env_value(dotted_path: str, raw: str) -> Any: # Boolean fields if dotted_path == "pipeline.maas.enabled": return raw.strip().lower() in ("maas", "true", "1", "yes") + if dotted_path == "pipeline.result_formatter.detect_printed_page_numbers": + return raw.strip().lower() in ("true", "1", "yes", "on") # Integer fields if dotted_path.endswith((".api_port", ".request_timeout", ".connect_timeout")): return int(raw) @@ -429,6 +434,7 @@ def from_env( "mode": "pipeline.maas.enabled", "timeout": "pipeline.maas.request_timeout", "log_level": "logging.level", + "detect_printed_page_numbers": "pipeline.result_formatter.detect_printed_page_numbers", # Self-hosted OCR API "ocr_api_host": "pipeline.ocr_api.api_host", "ocr_api_port": "pipeline.ocr_api.api_port", diff --git a/glmocr/config.yaml b/glmocr/config.yaml index 8c287fe..5679570 100644 --- a/glmocr/config.yaml +++ b/glmocr/config.yaml @@ -164,6 +164,7 @@ pipeline: - content - doc_title - figure_title + - number - paragraph_title - reference_content - text @@ -256,6 +257,7 @@ pipeline: - content - doc_title - figure_title + - number - paragraph_title - reference_content - text @@ -274,7 +276,6 @@ pipeline: abandon: - header - footer - - number - footnote - aside_text - reference diff --git a/glmocr/parser_result/base.py b/glmocr/parser_result/base.py index 0b996c2..e56f480 100644 --- a/glmocr/parser_result/base.py +++ b/glmocr/parser_result/base.py @@ -30,6 +30,9 @@ def __init__( original_images: Optional[List[str]] = None, image_files: Optional[Dict[str, Any]] = None, raw_json_result: Optional[list] = None, + page_metadata: Optional[List[Dict[str, Any]]] = None, + page_number_candidates: Optional[List[Dict[str, Any]]] = None, + document_page_numbering: Optional[Dict[str, Any]] = None, ): """Initialize. @@ -41,6 +44,9 @@ def __init__( regions, to be saved under ``imgs/`` during :meth:`save`. raw_json_result: Raw model output before post-processing; saved as ``{name}_model.json`` alongside the final result. + page_metadata: Derived per-page printed page metadata. + page_number_candidates: Raw printed page-number candidates. + document_page_numbering: Document-level numbering inference. """ if isinstance(json_result, str): try: @@ -56,6 +62,9 @@ def __init__( ] self.image_files = image_files self.raw_json_result = raw_json_result + self.page_metadata = page_metadata + self.page_number_candidates = page_number_candidates + self.document_page_numbering = document_page_numbering @abstractmethod def save( @@ -88,6 +97,19 @@ def _save_json_and_markdown(self, output_dir: Union[str, Path]) -> None: json_data = json.loads(json_data) except json.JSONDecodeError: pass + + has_printed_page_data = bool(self.page_metadata) or bool( + self.page_number_candidates + ) or self.document_page_numbering is not None + + if has_printed_page_data: + json_data = { + "json_result": json_data, + "page_metadata": self.page_metadata if self.page_metadata is not None else [], + "page_number_candidates": self.page_number_candidates if self.page_number_candidates is not None else [], + "document_page_numbering": self.document_page_numbering, + } + with open(json_file, "w", encoding="utf-8") as f: if isinstance(json_data, (dict, list)): json.dump(json_data, f, ensure_ascii=False, indent=2) @@ -134,6 +156,12 @@ def to_dict(self) -> dict: "markdown_result": self.markdown_result or "", "original_images": self.original_images, } + if self.page_metadata is not None: + d["page_metadata"] = self.page_metadata + if self.page_number_candidates is not None: + d["page_number_candidates"] = self.page_number_candidates + if self.document_page_numbering is not None: + d["document_page_numbering"] = self.document_page_numbering # Include optional metadata set by MaaS mode. for attr in ("_usage", "_data_info", "_error"): val = getattr(self, attr, None) diff --git a/glmocr/parser_result/pipeline_result.py b/glmocr/parser_result/pipeline_result.py index 800084c..7a9e9d9 100644 --- a/glmocr/parser_result/pipeline_result.py +++ b/glmocr/parser_result/pipeline_result.py @@ -26,6 +26,9 @@ def __init__( image_files: Optional[dict] = None, raw_json_result: Optional[list] = None, layout_vis_images: Optional[Dict[int, Any]] = None, + page_metadata: Optional[List[Dict[str, Any]]] = None, + page_number_candidates: Optional[List[Dict[str, Any]]] = None, + document_page_numbering: Optional[Dict[str, Any]] = None, ): """Initialize. @@ -38,6 +41,9 @@ def __init__( raw_json_result: Raw model output before post-processing (optional). layout_vis_images: Mapping of ``page_idx`` → PIL Image for layout visualization; saved to ``layout_vis/`` during :meth:`save`. + page_metadata: Derived per-page printed page metadata. + page_number_candidates: Raw printed page-number candidates. + document_page_numbering: Document-level numbering inference. """ super().__init__( json_result=json_result, @@ -45,6 +51,9 @@ def __init__( original_images=original_images, image_files=image_files, raw_json_result=raw_json_result, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) self.layout_vis_images = layout_vis_images diff --git a/glmocr/pipeline/pipeline.py b/glmocr/pipeline/pipeline.py index 699f48d..b1c48ac 100644 --- a/glmocr/pipeline/pipeline.py +++ b/glmocr/pipeline/pipeline.py @@ -362,6 +362,9 @@ def _emit_results( grouped, cropped_images=cropped_images or None, ) + page_metadata = self.result_formatter.page_metadata + page_number_candidates = self.result_formatter.page_number_candidates + document_page_numbering = self.result_formatter.document_page_numbering vis_images = {} for pi in page_indices: @@ -378,6 +381,9 @@ def _emit_results( image_files=image_files or None, raw_json_result=raw_json, layout_vis_images=vis_images or None, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) built.add(u) if preserve_order: diff --git a/glmocr/postprocess/result_formatter.py b/glmocr/postprocess/result_formatter.py index 8b31d5d..c385d6b 100644 --- a/glmocr/postprocess/result_formatter.py +++ b/glmocr/postprocess/result_formatter.py @@ -12,10 +12,11 @@ from __future__ import annotations +import collections import re import json from copy import deepcopy -from typing import TYPE_CHECKING, List, Dict, Tuple, Any +from typing import TYPE_CHECKING, List, Dict, Tuple, Any, Optional try: # Optional dependency for better English word validation quality. from wordfreq import zipf_frequency @@ -70,6 +71,10 @@ def __init__(self, config: "ResultFormatterConfig"): self.enable_merge_formula_numbers = config.enable_merge_formula_numbers self.enable_merge_text_blocks = config.enable_merge_text_blocks self.enable_format_bullet_points = config.enable_format_bullet_points + self.detect_printed_page_numbers = config.detect_printed_page_numbers + self.page_metadata: Optional[List[Dict[str, Any]]] = None + self.page_number_candidates: Optional[List[Dict[str, Any]]] = None + self.document_page_numbering: Optional[Dict[str, Any]] = None # ========================================================================= # OCR-only mode @@ -160,6 +165,10 @@ def process( (json_str, markdown_str, image_files) where *image_files* maps ``filename`` → PIL Image for the caller to persist. """ + self.page_metadata = None + self.page_number_candidates = None + self.document_page_numbering = None + json_final_results = [] with profiler.measure("format_regions"): @@ -173,6 +182,10 @@ def process( for item in sorted_results: result = deepcopy(item) + result["layout_index"] = result.get("layout_index", result.get("index", 0)) + result["layout_score"] = float( + result.get("layout_score", result.get("score") or 0.0) + ) result["native_label"] = result.get("label", "text") # Map labels @@ -215,6 +228,15 @@ def process( json_final_results.append(json_page_results) + if self.detect_printed_page_numbers: + ( + self.page_number_candidates, + self.document_page_numbering, + self.page_metadata, + ) = self.extract_printed_page_data(json_final_results) + + self._strip_layout_metadata(json_final_results) + # Generate markdown results and resolve image regions image_files: Dict[str, Any] = {} image_counter = 0 @@ -251,6 +273,190 @@ def process( return json_str, markdown_str, image_files + def extract_printed_page_data( + self, + pages: List[List[Dict[str, Any]]], + ) -> Tuple[ + List[Dict[str, Any]], + Optional[Dict[str, Any]], + List[Dict[str, Any]], + ]: + """Extract number candidates and derived printed page metadata.""" + candidates = self._extract_page_number_candidates(pages) + document_page_numbering = self._infer_document_page_numbering(candidates) + page_metadata = self._build_printed_page_metadata(candidates) + return candidates, document_page_numbering, page_metadata + + def _extract_page_number_candidates( + self, + pages: List[List[Dict[str, Any]]], + ) -> List[Dict[str, Any]]: + """Extract raw `number` candidates for printed page inference.""" + candidates: List[Dict[str, Any]] = [] + for page_index, page_blocks in enumerate(pages): + for block in page_blocks: + candidate = self._build_page_number_candidate(page_index, block) + if candidate is not None: + candidates.append(candidate) + return candidates + + def _build_page_number_candidate( + self, + page_index: int, + block: Dict[str, Any], + ) -> Optional[Dict[str, Any]]: + """Build a normalized page-number candidate from one layout block.""" + if block.get("native_label") != "number": + return None + + bbox = block.get("bbox_2d") + if not isinstance(bbox, list) or len(bbox) != 4: + return None + + label = self._normalize_printed_page_label(block.get("content")) + if label is None: + return None + + x1, y1, x2, y2 = bbox + width = x2 - x1 + height = y2 - y1 + if width <= 0 or height <= 0 or width > 140 or height > 120: + return None + if not self._is_margin_candidate(x1, y1, x2, y2): + return None + + return { + "page_index": page_index, + "label": "number", + "content": label, + "layout_index": block.get("layout_index", block.get("index", 0)), + "bbox_2d": bbox, + "layout_score": float(block.get("layout_score") or 0.0), + "numeric_like": label.isdigit(), + "roman_like": self._is_roman_like(label), + } + + @staticmethod + def _is_margin_candidate(x1: int, y1: int, x2: int, y2: int) -> bool: + """Return whether a candidate lies in a plausible page-margin folio area.""" + in_margin_band = y1 <= 120 or y2 >= 880 + in_outer_margin = x1 <= 180 or x2 >= 820 + return in_margin_band and in_outer_margin + + @staticmethod + def _is_roman_like(content: str) -> bool: + """Check whether a label looks like a Roman numeral folio.""" + return bool(re.fullmatch(r"(?i)[ivxlcdm]+", content)) + + def _infer_document_page_numbering( + self, + candidates: List[Dict[str, Any]], + ) -> Optional[Dict[str, Any]]: + """Infer document-level numbering from number-only candidates.""" + if not candidates: + return None + + best_candidates = self._best_candidates_by_page(candidates) + page_count = len(best_candidates) + numeric_candidates = [c for c in best_candidates if c["numeric_like"]] + roman_candidates = [c for c in best_candidates if c["roman_like"]] + + if numeric_candidates: + offsets = collections.Counter( + int(c["content"]) - int(c["page_index"]) for c in numeric_candidates + ) + page_offset, support = offsets.most_common(1)[0] + return { + "strategy": "visual_sequence", + "confidence": round(support / max(1, page_count), 3), + "sequence_type": "arabic", + "page_offset": page_offset, + "candidate_pages": page_count, + } + + if roman_candidates: + return { + "strategy": "visual_sequence", + "confidence": round(len(roman_candidates) / max(1, page_count), 3), + "sequence_type": "roman", + "page_offset": None, + "candidate_pages": len(roman_candidates), + } + + return None + + def _build_printed_page_metadata( + self, + candidates: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + """Build per-page printed page metadata from selected candidates.""" + if not candidates: + return [] + + metadata: List[Dict[str, Any]] = [] + for candidate in self._best_candidates_by_page(candidates): + metadata.append( + { + "page_index": candidate["page_index"], + "printed_page_label": candidate["content"], + "printed_page_block_index": candidate["layout_index"], + "printed_page_bbox_2d": candidate["bbox_2d"], + "printed_page_confidence": candidate["layout_score"], + } + ) + return metadata + + def _best_candidates_by_page( + self, + candidates: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + """Select the strongest candidate per page.""" + by_page: Dict[int, List[Dict[str, Any]]] = collections.defaultdict(list) + for candidate in candidates: + by_page[int(candidate["page_index"])].append(candidate) + return [ + min(by_page[page_index], key=self._candidate_sort_key) + for page_index in sorted(by_page) + ] + + @staticmethod + def _candidate_sort_key(block: Dict[str, Any]) -> tuple[int, int, int, int]: + """Prefer blocks nearest to outer top/bottom page margins.""" + bbox = block.get("bbox_2d") or [0, 0, 1000, 1000] + x1, y1, x2, y2 = bbox + top_distance = y1 + bottom_distance = 1000 - y2 + edge_distance = min(top_distance, bottom_distance) + side_distance = min(x1, 1000 - x2) + return ( + edge_distance, + side_distance, + -int(block.get("layout_score", 0) * 1000), + int(block.get("layout_index", block.get("index", 0))), + ) + + @staticmethod + def _normalize_printed_page_label(content: Any) -> Optional[str]: + """Normalize OCR text from a printed page-number candidate.""" + if not isinstance(content, str): + return None + label = content.strip() + if not label or len(label) > 12: + return None + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9\-./]*", label): + return None + if not (re.search(r"\d", label) or ResultFormatter._is_roman_like(label)): + return None + return label + + @staticmethod + def _strip_layout_metadata(pages: List[List[Dict[str, Any]]]) -> None: + """Remove broad layout-only metadata from final JSON blocks.""" + for page in pages: + for block in page: + block.pop("layout_index", None) + block.pop("layout_score", None) + # ========================================================================= # Content handling # ========================================================================= diff --git a/glmocr/tests/test_unit.py b/glmocr/tests/test_unit.py index 62ba50d..7d55353 100644 --- a/glmocr/tests/test_unit.py +++ b/glmocr/tests/test_unit.py @@ -1,6 +1,7 @@ """Unit tests for glmocr (no external services required).""" import json +import tempfile from pathlib import Path from unittest.mock import MagicMock, patch @@ -24,6 +25,14 @@ def test_config_to_dict(self): cfg = load_config().to_dict() assert isinstance(cfg, dict) + def test_default_config_routes_number_to_text_ocr(self): + """Default SDK config preserves PP-DocLayoutV3 number regions for OCR.""" + from glmocr.config import load_config + + cfg = load_config() + text_labels = cfg.pipeline.layout.label_task_mapping["text"] + assert "number" in text_labels + class TestLayoutDeviceUnit: """Unit tests for layout device selection and config plumbing (mocked).""" @@ -494,6 +503,164 @@ def test_result_formatter_clean_content(self): cleaned = formatter._clean_content("Hello....World") assert "....." not in cleaned + def test_result_formatter_feature_off_keeps_json_result_lean(self): + """Feature disabled does not leak broad layout metadata into json_result.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter(ResultFormatterConfig()) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [944, 12, 972, 42], + "score": 0.88, + } + ] + ] + + parsed = json.loads(formatter.process(grouped_results)[0]) + assert parsed[0][0]["native_label"] == "number" + assert "layout_index" not in parsed[0][0] + assert "layout_score" not in parsed[0][0] + + def test_result_formatter_extracts_page_number_data(self): + """Formatter extracts printed page data from number blocks.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [944, 12, 972, 42], + "score": 0.88, + } + ] + ] + + formatter.process(grouped_results) + + assert formatter.page_number_candidates[0]["layout_index"] == 7 + assert formatter.page_number_candidates[0]["layout_score"] == 0.88 + assert formatter.page_metadata == [ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ] + assert formatter.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ] + assert formatter.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + } + + def test_result_formatter_ignores_non_margin_number_blocks(self): + """Formatter ignores number blocks that are not in page margins.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [400, 400, 428, 430], + "score": 0.88, + } + ] + ] + + formatter.process(grouped_results) + + parsed = json.loads(formatter.process(grouped_results)[0]) + assert "layout_index" not in parsed[0][0] + assert "layout_score" not in parsed[0][0] + assert formatter.page_metadata == [] + assert formatter.page_number_candidates == [] + assert formatter.document_page_numbering is None + + def test_result_formatter_accepts_roman_number_candidates(self): + """Formatter preserves Roman numeral number candidates.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "iv", + "bbox_2d": [944, 12, 972, 42], + "score": 0.75, + } + ] + ] + + formatter.process(grouped_results) + + assert formatter.page_number_candidates[0]["layout_index"] == 7 + assert formatter.page_number_candidates[0]["layout_score"] == 0.75 + assert formatter.page_metadata == [ + { + "page_index": 0, + "printed_page_label": "iv", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.75, + } + ] + assert formatter.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "iv", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.75, + "numeric_like": False, + "roman_like": True, + } + ] + assert formatter.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "roman", + "page_offset": None, + "candidate_pages": 1, + } + class TestMaaSClient: """Tests for MaaSClient.""" @@ -845,6 +1012,17 @@ def test_no_env_returns_empty(self, monkeypatch): assert _collect_env_overrides() == {} + def test_detect_printed_page_numbers_env_var(self, monkeypatch): + """Printed page detection can be enabled via environment variable.""" + from glmocr.config import _collect_env_overrides + + monkeypatch.setenv("GLMOCR_DETECT_PRINTED_PAGE_NUMBERS", "true") + overrides = _collect_env_overrides() + assert ( + overrides["pipeline"]["result_formatter"]["detect_printed_page_numbers"] + is True + ) + class TestFromEnv: """Tests for GlmOcrConfig.from_env() – full priority chain.""" @@ -981,6 +1159,107 @@ def test_to_json_unicode_preserved(self): # ensure_ascii=False by default → raw CJK characters assert "中文测试" in s + def test_to_dict_includes_printed_page_fields(self): + r = self._make_result( + page_metadata=[ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ], + page_number_candidates=[ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ], + document_page_numbering={ + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + }, + ) + d = r.to_dict() + assert d["page_metadata"][0]["printed_page_label"] == "12" + assert d["page_number_candidates"][0]["label"] == "number" + assert d["document_page_numbering"]["page_offset"] == 12 + + def test_save_wraps_json_with_printed_page_fields(self): + r = self._make_result( + original_images=["paper.pdf"], + page_metadata=[ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ], + page_number_candidates=[ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ], + document_page_numbering={ + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + }, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert "json_result" in saved + assert saved["page_metadata"][0]["printed_page_label"] == "12" + assert saved["page_number_candidates"][0]["label"] == "number" + assert saved["document_page_numbering"]["page_offset"] == 12 + + def test_save_keeps_legacy_json_shape_without_printed_page_data(self): + r = self._make_result(original_images=["paper.pdf"]) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert isinstance(saved, list) + + def test_save_keeps_legacy_json_shape_when_detection_has_no_hits(self): + r = self._make_result( + original_images=["paper.pdf"], + page_metadata=[], + page_number_candidates=[], + document_page_numbering=None, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert isinstance(saved, list) + def test_repr(self): r = self._make_result() assert "PipelineResult" in repr(r) @@ -1251,6 +1530,93 @@ def test_parse_stream_selfhosted_delegates(self): preserve_order=True, ) + def test_maas_response_includes_printed_page_metadata_when_enabled(self): + """MaaS conversion derives printed page data from number blocks.""" + from glmocr.api import GlmOcr + from glmocr.config import GlmOcrConfig, ResultFormatterConfig + + parser = object.__new__(GlmOcr) + parser._use_maas = True + parser._pipeline = None + parser._maas_client = MagicMock() + parser.config_model = GlmOcrConfig() + parser.config_model.pipeline.result_formatter = ResultFormatterConfig( + detect_printed_page_numbers=True + ) + + response = { + "md_results": "", + "layout_details": [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [1926, 32, 1982, 111], + "score": 0.88, + } + ] + ], + "data_info": {"pages": [{"width": 2040, "height": 2640}]}, + } + + result = parser._maas_response_to_pipeline_result(response, "paper.pdf") + + assert result.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ] + assert result.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + } + assert result.page_metadata[0]["printed_page_label"] == "12" + + def test_maas_response_feature_off_keeps_json_result_lean(self): + """MaaS conversion does not leak broad layout metadata when feature is off.""" + from glmocr.api import GlmOcr + from glmocr.config import GlmOcrConfig + + parser = object.__new__(GlmOcr) + parser._use_maas = True + parser._pipeline = None + parser._maas_client = MagicMock() + parser.config_model = GlmOcrConfig() + + response = { + "md_results": "", + "layout_details": [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [1926, 32, 1982, 111], + "score": 0.88, + } + ] + ], + "data_info": {"pages": [{"width": 2040, "height": 2640}]}, + } + + result = parser._maas_response_to_pipeline_result(response, "paper.pdf") + + block = result.json_result[0][0] + assert block["native_label"] == "number" + assert "layout_index" not in block + assert "layout_score" not in block + class TestGlmOcrConstructor: """Tests for GlmOcr.__init__ kwarg handling (config assembly only).""" @@ -1308,6 +1674,25 @@ def test_selfhosted_model_kwarg_is_forwarded_to_ocr_api(self, monkeypatch): assert parser.config_model.pipeline.ocr_api.model == "glm-ocr" parser.close() + def test_detect_printed_page_numbers_kwarg_is_forwarded(self, monkeypatch): + """Public constructor flag enables printed page detection in config.""" + from glmocr.config import _ENV_MAP, ENV_PREFIX + + for suffix in _ENV_MAP: + monkeypatch.delenv(f"{ENV_PREFIX}{suffix}", raising=False) + monkeypatch.setattr("glmocr.config._find_dotenv", lambda: None) + + with patch("glmocr.maas_client.MaaSClient") as mock_maas: + mock_maas.return_value.start = MagicMock() + from glmocr.api import GlmOcr + + parser = GlmOcr(api_key="sk-test", detect_printed_page_numbers=True) + assert ( + parser.config_model.pipeline.result_formatter.detect_printed_page_numbers + is True + ) + parser.close() + class TestOCRClientOllamaConfig: """Tests for OCRClient initialization with Ollama api_mode.""" From 089258864455a77df99b3d996ecce20680742fcc Mon Sep 17 00:00:00 2001 From: VooDisss Date: Mon, 30 Mar 2026 23:40:57 +0300 Subject: [PATCH 2/7] Apply pre-commit formatting fixes --- glmocr/parser_result/base.py | 18 +++++++++++++----- glmocr/postprocess/result_formatter.py | 4 +++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/glmocr/parser_result/base.py b/glmocr/parser_result/base.py index e56f480..ae8963c 100644 --- a/glmocr/parser_result/base.py +++ b/glmocr/parser_result/base.py @@ -98,15 +98,23 @@ def _save_json_and_markdown(self, output_dir: Union[str, Path]) -> None: except json.JSONDecodeError: pass - has_printed_page_data = bool(self.page_metadata) or bool( - self.page_number_candidates - ) or self.document_page_numbering is not None + has_printed_page_data = ( + bool(self.page_metadata) + or bool(self.page_number_candidates) + or self.document_page_numbering is not None + ) if has_printed_page_data: json_data = { "json_result": json_data, - "page_metadata": self.page_metadata if self.page_metadata is not None else [], - "page_number_candidates": self.page_number_candidates if self.page_number_candidates is not None else [], + "page_metadata": ( + self.page_metadata if self.page_metadata is not None else [] + ), + "page_number_candidates": ( + self.page_number_candidates + if self.page_number_candidates is not None + else [] + ), "document_page_numbering": self.document_page_numbering, } diff --git a/glmocr/postprocess/result_formatter.py b/glmocr/postprocess/result_formatter.py index c385d6b..764ab27 100644 --- a/glmocr/postprocess/result_formatter.py +++ b/glmocr/postprocess/result_formatter.py @@ -182,7 +182,9 @@ def process( for item in sorted_results: result = deepcopy(item) - result["layout_index"] = result.get("layout_index", result.get("index", 0)) + result["layout_index"] = result.get( + "layout_index", result.get("index", 0) + ) result["layout_score"] = float( result.get("layout_score", result.get("score") or 0.0) ) From 9a50c113e34e6e080e98b212cc840bde6b5d4260 Mon Sep 17 00:00:00 2001 From: VooDisss Date: Tue, 31 Mar 2026 06:05:10 +0300 Subject: [PATCH 3/7] Add optional SDK image asset export with rendered and embedded outputs Add an SDK-owned image asset export path for PP-DocLayoutV3 image regions. Rendered region assets are now the base behavior of this feature and are written to imgs_rendered/. When enable_image_asset_export=True, the SDK additionally inspects embedded PDF images via PyMuPDF, matches them to layout image regions using same-page geometry, IoU, containment, aspect-ratio plausibility, and one-to-one assignment, and writes matched assets to imgs_embedded/. Markdown selection is controlled by markdown_image_preference ('embedded' or 'rendered'). The image block contract is explicit and stable: image_path is the selected asset path, rendered_image_path reflects the rendered asset when one exists, embedded_image_path is null when no embedded match exists, and image_asset_source records whether the selected asset is rendered or embedded. The implementation avoids center-distance-only matching, preserves formatter-produced rendered assets in self-hosted mode instead of re-deriving them, and aggressively prevents stale asset advertisement: if a rendered asset cannot actually be preserved or regenerated, final JSON and Markdown do not continue to reference it. Focused regression tests cover rendered-only mode, embedded preference, rendered preference, nested asset persistence, preservation misses, no-render-pages recovery, crop-failure recovery, and both rendered-origin and embedded-origin stale-markdown cleanup. --- glmocr/api.py | 9 +- glmocr/config.py | 39 +- glmocr/parser_result/base.py | 17 +- glmocr/pipeline/pipeline.py | 15 + glmocr/postprocess/result_formatter.py | 9 +- glmocr/tests/test_unit.py | 664 ++++++++++++++++++++++++ glmocr/utils/image_asset_utils.py | 673 +++++++++++++++++++++++++ 7 files changed, 1413 insertions(+), 13 deletions(-) create mode 100644 glmocr/utils/image_asset_utils.py diff --git a/glmocr/api.py b/glmocr/api.py index ffb218a..59f995f 100644 --- a/glmocr/api.py +++ b/glmocr/api.py @@ -29,8 +29,8 @@ from glmocr.config import load_config from glmocr.parser_result import PipelineResult +from glmocr.utils.image_asset_utils import export_image_assets from glmocr.utils.logging import get_logger, ensure_logging_configured -from glmocr.utils.markdown_utils import resolve_image_regions logger = get_logger(__name__) @@ -459,10 +459,15 @@ def _maas_response_to_pipeline_result( pages_info, ) - json_result, markdown_result, image_files = resolve_image_regions( + json_result, markdown_result, image_files = export_image_assets( json_result, markdown_result, source, + enable_image_asset_export=self.config_model.pipeline.result_formatter.enable_image_asset_export, + markdown_image_preference=self.config_model.pipeline.result_formatter.markdown_image_preference, + image_match_iou_threshold=self.config_model.pipeline.result_formatter.image_match_iou_threshold, + image_match_containment_threshold=self.config_model.pipeline.result_formatter.image_match_containment_threshold, + rendered_image_dpi=self.config_model.pipeline.result_formatter.rendered_image_dpi, ) page_metadata = None diff --git a/glmocr/config.py b/glmocr/config.py index e348db5..d8123b8 100644 --- a/glmocr/config.py +++ b/glmocr/config.py @@ -4,7 +4,7 @@ import os from pathlib import Path -from typing import Any, Dict, Optional, Union, List +from typing import Any, Dict, Optional, Union, List, Literal import yaml from dotenv import dotenv_values @@ -50,6 +50,11 @@ def _find_dotenv(start: Optional[Path] = None) -> Optional[Path]: "LAYOUT_DEVICE": "pipeline.layout.device", # Result formatter "DETECT_PRINTED_PAGE_NUMBERS": "pipeline.result_formatter.detect_printed_page_numbers", + "ENABLE_IMAGE_ASSET_EXPORT": "pipeline.result_formatter.enable_image_asset_export", + "MARKDOWN_IMAGE_PREFERENCE": "pipeline.result_formatter.markdown_image_preference", + "IMAGE_MATCH_IOU_THRESHOLD": "pipeline.result_formatter.image_match_iou_threshold", + "IMAGE_MATCH_CONTAINMENT_THRESHOLD": "pipeline.result_formatter.image_match_containment_threshold", + "RENDERED_IMAGE_DPI": "pipeline.result_formatter.rendered_image_dpi", # Logging "LOG_LEVEL": "logging.level", } @@ -178,8 +183,24 @@ class ResultFormatterConfig(_BaseConfig): enable_merge_text_blocks: bool = True enable_format_bullet_points: bool = True detect_printed_page_numbers: bool = False + enable_image_asset_export: bool = False + markdown_image_preference: Literal["embedded", "rendered"] = "embedded" + image_match_iou_threshold: float = 0.5 + image_match_containment_threshold: float = 0.8 + rendered_image_dpi: int = 300 label_visualization_mapping: Dict[str, Any] = Field(default_factory=dict) + @field_validator("markdown_image_preference") + @classmethod + def _validate_markdown_image_preference( + cls, value: str + ) -> Literal["embedded", "rendered"]: + if value not in ("embedded", "rendered"): + raise ValueError( + "markdown_image_preference must be 'embedded' or 'rendered'" + ) + return value + class LayoutConfig(_BaseConfig): model_dir: Optional[str] = None @@ -265,9 +286,20 @@ def _coerce_env_value(dotted_path: str, raw: str) -> Any: return raw.strip().lower() in ("maas", "true", "1", "yes") if dotted_path == "pipeline.result_formatter.detect_printed_page_numbers": return raw.strip().lower() in ("true", "1", "yes", "on") + if dotted_path == "pipeline.result_formatter.enable_image_asset_export": + return raw.strip().lower() in ("true", "1", "yes", "on") # Integer fields if dotted_path.endswith((".api_port", ".request_timeout", ".connect_timeout")): return int(raw) + if dotted_path == "pipeline.result_formatter.rendered_image_dpi": + return int(raw) + if dotted_path.endswith( + ( + ".image_match_iou_threshold", + ".image_match_containment_threshold", + ) + ): + return float(raw) return raw @@ -435,6 +467,11 @@ def from_env( "timeout": "pipeline.maas.request_timeout", "log_level": "logging.level", "detect_printed_page_numbers": "pipeline.result_formatter.detect_printed_page_numbers", + "enable_image_asset_export": "pipeline.result_formatter.enable_image_asset_export", + "markdown_image_preference": "pipeline.result_formatter.markdown_image_preference", + "image_match_iou_threshold": "pipeline.result_formatter.image_match_iou_threshold", + "image_match_containment_threshold": "pipeline.result_formatter.image_match_containment_threshold", + "rendered_image_dpi": "pipeline.result_formatter.rendered_image_dpi", # Self-hosted OCR API "ocr_api_host": "pipeline.ocr_api.api_host", "ocr_api_port": "pipeline.ocr_api.api_port", diff --git a/glmocr/parser_result/base.py b/glmocr/parser_result/base.py index ae8963c..5a4b423 100644 --- a/glmocr/parser_result/base.py +++ b/glmocr/parser_result/base.py @@ -40,8 +40,8 @@ def __init__( json_result: JSON result (string, dict, or list). markdown_result: Markdown result (optional). original_images: Original image paths. - image_files: Mapping of ``filename`` → PIL Image for image-type - regions, to be saved under ``imgs/`` during :meth:`save`. + image_files: Mapping of relative output path → image asset payload + for image-type regions, to be saved during :meth:`save`. raw_json_result: Raw model output before post-processing; saved as ``{name}_model.json`` alongside the final result. page_metadata: Derived per-page printed page metadata. @@ -144,13 +144,16 @@ def _save_json_and_markdown(self, output_dir: Union[str, Path]) -> None: # Image files produced by the result formatter if self.image_files: - imgs_dir = output_path / "imgs" - imgs_dir.mkdir(parents=True, exist_ok=True) - for filename, img in self.image_files.items(): + for rel_path, img in self.image_files.items(): try: - img.save(imgs_dir / filename, quality=95) + target = output_path / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + if isinstance(img, (bytes, bytearray)): + target.write_bytes(bytes(img)) + else: + img.save(target, quality=95) except Exception as e: - logger.warning("Failed to save image %s: %s", filename, e) + logger.warning("Failed to save image %s: %s", rel_path, e) self.image_files = None def to_dict(self) -> dict: diff --git a/glmocr/pipeline/pipeline.py b/glmocr/pipeline/pipeline.py index b1c48ac..0705099 100644 --- a/glmocr/pipeline/pipeline.py +++ b/glmocr/pipeline/pipeline.py @@ -15,6 +15,7 @@ from __future__ import annotations +import json import time import threading from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional @@ -23,6 +24,7 @@ from glmocr.ocr_client import OCRClient from glmocr.parser_result import PipelineResult from glmocr.postprocess import ResultFormatter +from glmocr.utils.image_asset_utils import export_image_assets from glmocr.utils.logging import get_logger from glmocr.pipeline._common import ( @@ -362,6 +364,19 @@ def _emit_results( grouped, cropped_images=cropped_images or None, ) + parsed_json = json.loads(json_u) + parsed_json, md_u, image_files = export_image_assets( + parsed_json, + md_u, + original_inputs[u], + enable_image_asset_export=self.config.result_formatter.enable_image_asset_export, + markdown_image_preference=self.config.result_formatter.markdown_image_preference, + image_match_iou_threshold=self.config.result_formatter.image_match_iou_threshold, + image_match_containment_threshold=self.config.result_formatter.image_match_containment_threshold, + rendered_image_dpi=self.config.result_formatter.rendered_image_dpi, + rendered_images=image_files or None, + ) + json_u = json.dumps(parsed_json, ensure_ascii=False, indent=2) page_metadata = self.result_formatter.page_metadata page_number_candidates = self.result_formatter.page_number_candidates document_page_numbering = self.result_formatter.document_page_numbering diff --git a/glmocr/postprocess/result_formatter.py b/glmocr/postprocess/result_formatter.py index 764ab27..6797b03 100644 --- a/glmocr/postprocess/result_formatter.py +++ b/glmocr/postprocess/result_formatter.py @@ -163,7 +163,7 @@ def process( Returns: (json_str, markdown_str, image_files) where *image_files* maps - ``filename`` → PIL Image for the caller to persist. + relative output path → PIL Image for the caller to persist. """ self.page_metadata = None self.page_number_candidates = None @@ -258,9 +258,12 @@ def process( filename = ( f"{image_prefix}_page{page_idx}_idx{image_counter}.jpg" ) - rel_path = f"imgs/{filename}" - image_files[filename] = img + rel_path = f"imgs_rendered/{filename}" + image_files[rel_path] = img result["image_path"] = rel_path + result["rendered_image_path"] = rel_path + result["embedded_image_path"] = None + result["image_asset_source"] = "rendered" markdown_page_results.append( f"![Image {page_idx}-{image_counter}]({rel_path})" ) diff --git a/glmocr/tests/test_unit.py b/glmocr/tests/test_unit.py index 7d55353..88f462f 100644 --- a/glmocr/tests/test_unit.py +++ b/glmocr/tests/test_unit.py @@ -33,6 +33,17 @@ def test_default_config_routes_number_to_text_ocr(self): text_labels = cfg.pipeline.layout.label_task_mapping["text"] assert "number" in text_labels + def test_image_asset_export_defaults(self): + """Image asset export defaults remain conservative and opt-in.""" + from glmocr.config import ResultFormatterConfig + + cfg = ResultFormatterConfig() + assert cfg.enable_image_asset_export is False + assert cfg.markdown_image_preference == "embedded" + assert cfg.image_match_iou_threshold == 0.5 + assert cfg.image_match_containment_threshold == 0.8 + assert cfg.rendered_image_dpi == 300 + class TestLayoutDeviceUnit: """Unit tests for layout device selection and config plumbing (mocked).""" @@ -1260,6 +1271,26 @@ def test_save_keeps_legacy_json_shape_when_detection_has_no_hits(self): assert isinstance(saved, list) + def test_save_supports_nested_image_asset_paths_and_bytes(self): + from io import BytesIO + + from PIL import Image + + buf = BytesIO() + Image.new("RGB", (4, 4), color=(255, 0, 0)).save(buf, format="PNG") + payload = buf.getvalue() + + r = self._make_result( + original_images=["paper.pdf"], + image_files={"imgs_embedded/test.png": payload}, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved_path = Path(tmp_dir, "paper", "imgs_embedded", "test.png") + assert saved_path.exists() + assert saved_path.read_bytes() == payload + def test_repr(self): r = self._make_result() assert "PipelineResult" in repr(r) @@ -1694,6 +1725,639 @@ def test_detect_printed_page_numbers_kwarg_is_forwarded(self, monkeypatch): parser.close() +class TestImageAssetExport: + """Tests for SDK-owned image asset export.""" + + def test_export_image_assets_prefers_embedded_pdf_image(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + } + ] + ] + markdown = f"![](page=0,bbox={bbox})" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_asset_source"] == "embedded" + assert block["image_path"].startswith("imgs_embedded/") + assert block["embedded_image_path"].startswith("imgs_embedded/") + assert block["rendered_image_path"].startswith("imgs_rendered/") + assert any(path.startswith("imgs_embedded/") for path in image_files) + assert any(path.startswith("imgs_rendered/") for path in image_files) + assert "imgs_embedded/" in updated_md + + def test_export_image_assets_falls_back_to_rendered_when_no_embedded_match(self): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + bbox = [100, 100, 500, 500] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + } + ] + ] + markdown = f"![](page=0,bbox={bbox})" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.5, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_asset_source"] == "rendered" + assert block["image_path"].startswith("imgs_rendered/") + assert block["rendered_image_path"].startswith("imgs_rendered/") + assert block["embedded_image_path"] is None + assert all(path.startswith("imgs_rendered/") for path in image_files) + assert "imgs_rendered/" in updated_md + + def test_export_image_assets_rendered_only_mode_uses_imgs_rendered(self): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + bbox = [100, 100, 500, 500] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + } + ] + ] + markdown = f"![](page=0,bbox={bbox})" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=False, + markdown_image_preference="embedded", + image_match_iou_threshold=0.5, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_asset_source"] == "rendered" + assert block["image_path"].startswith("imgs_rendered/") + assert block["rendered_image_path"] == block["image_path"] + assert block["embedded_image_path"] is None + assert all(path.startswith("imgs_rendered/") for path in image_files) + assert "imgs_rendered/" in updated_md + + def test_export_image_assets_prefers_rendered_when_configured(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [{"index": 0, "label": "image", "bbox_2d": bbox, "content": None}] + ] + markdown = f"![](page=0,bbox={bbox})" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="rendered", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_asset_source"] == "rendered" + assert block["image_path"].startswith("imgs_rendered/") + assert block["rendered_image_path"].startswith("imgs_rendered/") + assert block["embedded_image_path"].startswith("imgs_embedded/") + assert "imgs_rendered/" in updated_md + + def test_preserve_rendered_assets_uses_rendered_path_over_image_path(self): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + rendered = Image.new("RGB", (32, 24), color=(10, 20, 30)) + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_embedded/existing.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/existing.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/existing.png)" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=False, + markdown_image_preference="embedded", + image_match_iou_threshold=0.5, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={"rendered_page0_idx0.jpg": rendered}, + ) + + block = updated_json[0][0] + assert block["image_path"] == "imgs_rendered/rendered_page0_idx0.jpg" + assert ( + block["rendered_image_path"] == "imgs_rendered/rendered_page0_idx0.jpg" + ) + assert block["embedded_image_path"] is None + assert "imgs_rendered/rendered_page0_idx0.jpg" in image_files + assert updated_md == "![Image](imgs_rendered/rendered_page0_idx0.jpg)" + + def test_preserve_rendered_assets_missing_key_keeps_block_coherent(self): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_embedded/existing.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/existing.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/existing.png)" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=False, + markdown_image_preference="embedded", + image_match_iou_threshold=0.5, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={ + "other.jpg": Image.new("RGB", (4, 4), color=(1, 2, 3)) + }, + ) + + block = updated_json[0][0] + assert block["image_path"].startswith("imgs_rendered/") + assert ( + block["rendered_image_path"] == "imgs_rendered/rendered_page0_idx0.jpg" + ) + assert block["embedded_image_path"] is None + assert block["image_asset_source"] == "rendered" + assert "imgs_rendered/rendered_page0_idx0.jpg" in image_files + assert "imgs_rendered/rendered_page0_idx0.jpg" in updated_md + + def test_preferred_mode_missing_rendered_key_does_not_leak_helper_fields(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + "image_path": "imgs_embedded/stale.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/stale.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/stale.png)" + + updated_json, _, _ = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={ + "other.jpg": Image.new("RGB", (4, 4), color=(1, 2, 3)) + }, + ) + + block = updated_json[0][0] + assert "_needs_rendered_export" not in block + assert "_previous_image_path" not in block + + def test_preferred_mode_render_recovery_failure_keeps_existing_asset_state(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + "image_path": "imgs_embedded/stale.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/stale.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/stale.png)" + + with patch( + "glmocr.utils.image_asset_utils.crop_image_region", + side_effect=RuntimeError("crop failed"), + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={ + "other.jpg": Image.new("RGB", (4, 4), color=(1, 2, 3)) + }, + ) + + block = updated_json[0][0] + assert block["image_path"].startswith("imgs_embedded/") + assert block["rendered_image_path"] is None + assert block["embedded_image_path"].startswith("imgs_embedded/") + assert block["image_asset_source"] == "embedded" + assert "imgs_embedded/" in updated_md + assert all( + not path.startswith("imgs_rendered/rendered_page0_idx0.jpg") + for path in image_files + ) + + def test_preferred_mode_render_failure_without_embedded_match_does_not_advertise_stale_rendered_asset( + self, + ): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": None, + "image_asset_source": "rendered", + "_needs_rendered_export": True, + } + ] + ] + markdown = "![Image 0-0](imgs_rendered/rendered_page0_idx0.jpg)" + + with ( + patch( + "glmocr.utils.image_asset_utils._load_render_pages", + return_value=[Image.new("RGB", (200, 200), color=(255, 255, 255))], + ), + patch( + "glmocr.utils.image_asset_utils.crop_image_region", + side_effect=RuntimeError("crop failed"), + ), + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_path"] is None + assert block["rendered_image_path"] is None + assert block["embedded_image_path"] is None + assert block["image_asset_source"] == "rendered" + assert "imgs_rendered/rendered_page0_idx0.jpg" not in updated_md + assert image_files == {} + + def test_preferred_mode_no_render_pages_does_not_advertise_rendered_asset(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + "image_path": "imgs_embedded/stale.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/stale.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/stale.png)" + + with patch( + "glmocr.utils.image_asset_utils._load_render_pages", return_value=[] + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={ + "other.jpg": Image.new("RGB", (4, 4), color=(1, 2, 3)) + }, + ) + + block = updated_json[0][0] + assert block["image_path"].startswith("imgs_embedded/") + assert block["rendered_image_path"] is None + assert block["embedded_image_path"].startswith("imgs_embedded/") + assert block["image_asset_source"] == "embedded" + assert "imgs_embedded/embedded_page0_idx0_xref" in updated_md + assert all( + not path.startswith("imgs_rendered/rendered_page0_idx0.jpg") + for path in image_files + ) + + def test_preferred_mode_no_render_pages_and_no_embedded_match_does_not_point_to_stale_rendered_path( + self, + ): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": None, + "image_asset_source": "rendered", + "_needs_rendered_export": True, + } + ] + ] + markdown = "![Image](imgs_rendered/rendered_page0_idx0.jpg)" + + with patch( + "glmocr.utils.image_asset_utils._load_render_pages", return_value=[] + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_path"] is None + assert block["rendered_image_path"] is None + assert block["embedded_image_path"] is None + assert block["image_asset_source"] == "rendered" + assert "imgs_rendered/rendered_page0_idx0.jpg" not in updated_md + assert image_files == {} + + def test_preferred_mode_embedded_markdown_origin_is_removed_when_no_asset_survives( + self, + ): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_embedded/original.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": None, + "image_asset_source": "embedded", + "_needs_rendered_export": True, + "_previous_image_path": "imgs_embedded/original.png", + } + ] + ] + markdown = "![Image 0-0](imgs_embedded/original.png)" + + with patch( + "glmocr.utils.image_asset_utils._load_render_pages", return_value=[] + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_path"] is None + assert block["rendered_image_path"] is None + assert block["embedded_image_path"] is None + assert "imgs_embedded/original.png" not in updated_md + assert "imgs_rendered/rendered_page0_idx0.jpg" not in updated_md + assert image_files == {} + + class TestOCRClientOllamaConfig: """Tests for OCRClient initialization with Ollama api_mode.""" diff --git a/glmocr/utils/image_asset_utils.py b/glmocr/utils/image_asset_utils.py new file mode 100644 index 0000000..cbe6f52 --- /dev/null +++ b/glmocr/utils/image_asset_utils.py @@ -0,0 +1,673 @@ +"""Image asset export utilities. + +Optional SDK-owned export of rendered and embedded image assets for layout image +regions. Embedded PDF images are matched geometrically; rendered crops remain the +fallback. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from PIL import Image + +from glmocr.utils.image_utils import crop_image_region, pdf_to_images_pil +from glmocr.utils.logging import get_logger + +logger = get_logger(__name__) + +try: + import fitz +except Exception: # pragma: no cover + fitz = None # type: ignore[assignment] + + +def export_image_assets( + json_result: List[list], + markdown_result: str, + source: str, + *, + enable_image_asset_export: bool, + markdown_image_preference: str, + image_match_iou_threshold: float, + image_match_containment_threshold: float, + rendered_image_dpi: int, + rendered_images: Optional[Dict[str, Any]] = None, +) -> Tuple[List[list], str, Dict[str, Any]]: + """Return updated JSON/Markdown plus exportable image assets. + + When disabled: + - preserve existing rendered assets if already present (self-hosted) + - otherwise create rendered crops under ``imgs_rendered/`` (MaaS / bbox-only path) + + When enabled: + - export rendered crops under ``imgs_rendered/`` + - try to export matched embedded images under ``imgs_embedded/`` + - point JSON/Markdown to the preferred asset + """ + has_images = any( + region.get("label") == "image" + for page in json_result + if isinstance(page, list) + for region in page + if isinstance(region, dict) + ) + if not has_images: + return json_result, markdown_result, rendered_images or {} + + if not enable_image_asset_export: + return _create_rendered_only_assets( + json_result, + markdown_result, + source, + rendered_image_dpi=rendered_image_dpi, + rendered_images=rendered_images, + ) + + return _create_preferred_assets( + json_result, + markdown_result, + source, + markdown_image_preference=markdown_image_preference, + image_match_iou_threshold=image_match_iou_threshold, + image_match_containment_threshold=image_match_containment_threshold, + rendered_image_dpi=rendered_image_dpi, + rendered_images=rendered_images, + ) + + +def _create_rendered_only_assets( + json_result: List[list], + markdown_result: str, + source: str, + *, + rendered_image_dpi: int, + rendered_images: Optional[Dict[str, Any]] = None, +) -> Tuple[List[list], str, Dict[str, Any]]: + if rendered_images: + json_result, markdown_result, image_files = _preserve_rendered_assets( + json_result, markdown_result, rendered_images + ) + if not _has_pending_rendered_assets(json_result): + return ( + _strip_internal_image_fields(json_result), + markdown_result, + image_files, + ) + else: + image_files = {} + + loaded_images = _load_render_pages(source, rendered_image_dpi) + if not loaded_images: + return ( + _strip_internal_image_fields(_fill_missing_image_asset_fields(json_result)), + markdown_result, + image_files, + ) + + image_counter = 0 + updated_json: List[list] = [] + + for page_idx, page in enumerate(json_result): + if not isinstance(page, list): + updated_json.append(page) + continue + page_copy = [] + for region in page: + if ( + not isinstance(region, dict) + or region.get("label") != "image" + or page_idx >= len(loaded_images) + ): + page_copy.append(region) + continue + + region_copy = dict(region) + region_copy.setdefault("image_path", None) + region_copy.setdefault("rendered_image_path", None) + region_copy.setdefault("embedded_image_path", None) + region_copy.setdefault("image_asset_source", "rendered") + bbox = region.get("bbox_2d") + polygon = region.get("polygon") + previous_image_path = region_copy.get("image_path") + previous_embedded_path = region_copy.get("embedded_image_path") + previous_asset_source = region_copy.get("image_asset_source", "rendered") + if bbox and ( + region_copy.get("_needs_rendered_export") + or not region_copy.get("rendered_image_path") + ): + try: + cropped = crop_image_region(loaded_images[page_idx], bbox, polygon) + rel_path = region_copy.get("rendered_image_path") or ( + f"imgs_rendered/rendered_page{page_idx}_idx{region.get('index', image_counter)}.jpg" + ) + image_files[rel_path] = cropped + region_copy["image_path"] = rel_path + region_copy["rendered_image_path"] = rel_path + region_copy["embedded_image_path"] = None + region_copy["image_asset_source"] = "rendered" + replace_region = dict(region) + if region_copy.get("_previous_image_path"): + replace_region["image_path"] = region_copy[ + "_previous_image_path" + ] + markdown_result = _replace_markdown_image_reference( + markdown_result, + replace_region, + page_idx, + bbox, + rel_path, + ) + image_counter += 1 + except Exception as e: + logger.warning( + "Failed to render image asset (page=%d, bbox=%s): %s", + page_idx, + bbox, + e, + ) + region_copy["image_path"] = previous_image_path + region_copy["rendered_image_path"] = None + region_copy["embedded_image_path"] = previous_embedded_path + region_copy["image_asset_source"] = previous_asset_source + region_copy.pop("_needs_rendered_export", None) + region_copy.pop("_previous_image_path", None) + page_copy.append(region_copy) + updated_json.append(page_copy) + + return _strip_internal_image_fields(updated_json), markdown_result, image_files + + +def _create_preferred_assets( + json_result: List[list], + markdown_result: str, + source: str, + *, + markdown_image_preference: str, + image_match_iou_threshold: float, + image_match_containment_threshold: float, + rendered_image_dpi: int, + rendered_images: Optional[Dict[str, Any]] = None, +) -> Tuple[List[list], str, Dict[str, Any]]: + if rendered_images: + rendered_json, markdown_result, image_files = _preserve_rendered_assets( + json_result, markdown_result, rendered_images + ) + if _has_pending_rendered_assets(rendered_json): + rendered_pages = _load_render_pages(source, rendered_image_dpi) + else: + rendered_pages = [] + json_result = rendered_json + else: + rendered_pages = _load_render_pages(source, rendered_image_dpi) + image_files = {} + embedded_by_page = _inspect_embedded_pdf_images(source) + updated_json: List[list] = [] + + for page_idx, page in enumerate(json_result): + if not isinstance(page, list): + updated_json.append(page) + continue + + image_regions = [ + region + for region in page + if isinstance(region, dict) and region.get("label") == "image" + ] + matches = _match_embedded_images( + image_regions, + embedded_by_page.get(page_idx, []), + image_match_iou_threshold=image_match_iou_threshold, + image_match_containment_threshold=image_match_containment_threshold, + ) + + page_copy = [] + for region in page: + if not isinstance(region, dict) or region.get("label") != "image": + page_copy.append(region) + continue + + region_copy = dict(region) + region_copy.setdefault("image_path", None) + region_copy.setdefault("rendered_image_path", None) + region_copy.setdefault("embedded_image_path", None) + region_copy.setdefault("image_asset_source", "rendered") + bbox = region.get("bbox_2d") + polygon = region.get("polygon") + rendered_rel_path = region_copy.get("rendered_image_path") + embedded_rel_path = None + previous_image_path = region_copy.get("image_path") + render_failed = False + rendered_asset_available = bool( + rendered_rel_path and not region_copy.get("_needs_rendered_export") + ) + + if ( + bbox + and ( + rendered_rel_path is None + or region_copy.get("_needs_rendered_export") + ) + and page_idx < len(rendered_pages) + ): + try: + rendered = crop_image_region( + rendered_pages[page_idx], bbox, polygon + ) + rendered_rel_path = rendered_rel_path or ( + f"imgs_rendered/rendered_page{page_idx}_idx{region.get('index', 0)}.jpg" + ) + image_files[rendered_rel_path] = rendered + rendered_asset_available = True + except Exception as e: + logger.warning( + "Failed to render fallback image asset (page=%d, bbox=%s): %s", + page_idx, + bbox, + e, + ) + render_failed = True + rendered_rel_path = None + + match = matches.get(int(region.get("index", 0))) + if match is not None: + embedded_rel_path = f"imgs_embedded/embedded_page{page_idx}_idx{region.get('index', 0)}_xref{match['xref']}.{match['ext']}" + image_files[embedded_rel_path] = match["image_bytes"] + + effective_rendered_path = ( + rendered_rel_path if rendered_asset_available else None + ) + chosen_path = _choose_preferred_path( + embedded_rel_path, + effective_rendered_path, + markdown_image_preference=markdown_image_preference, + ) + + if render_failed and embedded_rel_path is None: + chosen_path = None + effective_rendered_path = None + original_markdown_path = ( + region_copy.get("_previous_image_path") or previous_image_path + ) + if bbox and original_markdown_path: + markdown_result = _replace_markdown_image_reference( + markdown_result, + { + "image_path": original_markdown_path, + "index": region.get("index", 0), + }, + page_idx, + bbox, + "", + ) + elif ( + region_copy.get("_needs_rendered_export") + and not rendered_asset_available + and embedded_rel_path is None + ): + chosen_path = None + original_markdown_path = ( + region_copy.get("_previous_image_path") or previous_image_path + ) + if bbox and original_markdown_path: + markdown_result = _replace_markdown_image_reference( + markdown_result, + { + "image_path": original_markdown_path, + "index": region.get("index", 0), + }, + page_idx, + bbox, + "", + ) + + if chosen_path is not None and bbox: + replace_region = dict(region) + if region_copy.get("_previous_image_path"): + replace_region["image_path"] = region_copy["_previous_image_path"] + elif previous_image_path: + replace_region["image_path"] = previous_image_path + markdown_result = _replace_markdown_image_reference( + markdown_result, replace_region, page_idx, bbox, chosen_path + ) + region_copy["image_path"] = chosen_path + if rendered_asset_available and effective_rendered_path is not None: + region_copy["rendered_image_path"] = effective_rendered_path + else: + region_copy["rendered_image_path"] = None + if render_failed and embedded_rel_path is None: + rendered_rel_path = None + region_copy["embedded_image_path"] = None + region_copy["rendered_image_path"] = None + region_copy["image_asset_source"] = "rendered" + region_copy["image_path"] = None + else: + region_copy["embedded_image_path"] = embedded_rel_path + if embedded_rel_path is None and not rendered_asset_available: + region_copy["image_asset_source"] = "rendered" + region_copy["image_path"] = None + else: + region_copy["image_asset_source"] = ( + "embedded" if chosen_path == embedded_rel_path else "rendered" + ) + if render_failed and embedded_rel_path is not None: + region_copy["rendered_image_path"] = None + region_copy.pop("_needs_rendered_export", None) + region_copy.pop("_previous_image_path", None) + page_copy.append(region_copy) + + updated_json.append(page_copy) + + return _strip_internal_image_fields(updated_json), markdown_result, image_files + + +def _load_render_pages(source: str, rendered_image_dpi: int) -> List[Image.Image]: + path = Path(source) + try: + if path.suffix.lower() == ".pdf" and path.is_file(): + return pdf_to_images_pil( + str(path), + dpi=rendered_image_dpi, + max_width_or_height=6000, + ) + if path.is_file(): + img = Image.open(str(path)) + if img.mode != "RGB": + img = img.convert("RGB") + return [img] + except Exception as e: + logger.warning("Cannot load source %s for image asset export: %s", source, e) + return [] + + +def _inspect_embedded_pdf_images(source: str) -> Dict[int, List[Dict[str, Any]]]: + if fitz is None: + return {} + path = Path(source) + if path.suffix.lower() != ".pdf" or not path.is_file(): + return {} + + doc = fitz.open(str(path)) + by_page: Dict[int, List[Dict[str, Any]]] = {} + try: + for page_index in range(len(doc)): + page = doc[page_index] + width = float(page.rect.width) or 1.0 + height = float(page.rect.height) or 1.0 + instances: List[Dict[str, Any]] = [] + for image in page.get_images(full=True): + xref = int(image[0]) + try: + extracted = doc.extract_image(xref) + rects = page.get_image_rects(xref, transform=True) + except Exception: + continue + for placement_idx, placement in enumerate(rects): + rect = placement[0] if isinstance(placement, tuple) else placement + bbox_norm = [ + rect.x0 / width, + rect.y0 / height, + rect.x1 / width, + rect.y1 / height, + ] + instances.append( + { + "xref": xref, + "ext": extracted.get("ext", "bin"), + "image_bytes": extracted.get("image", b""), + "width": int(extracted.get("width") or image[2] or 0), + "height": int(extracted.get("height") or image[3] or 0), + "bbox_norm": bbox_norm, + "placement_index": placement_idx, + } + ) + if instances: + by_page[page_index] = instances + finally: + doc.close() + return by_page + + +def _match_embedded_images( + image_regions: List[Dict[str, Any]], + embedded_instances: List[Dict[str, Any]], + *, + image_match_iou_threshold: float, + image_match_containment_threshold: float, +) -> Dict[int, Dict[str, Any]]: + candidates: List[Tuple[float, int, int]] = [] + for region in image_regions: + bbox = region.get("bbox_2d") + if not bbox or len(bbox) != 4: + continue + region_idx = int(region.get("index", 0)) + region_bbox = [coord / 1000.0 for coord in bbox] + region_ar = _bbox_aspect_ratio(region_bbox) + for embedded_idx, embedded in enumerate(embedded_instances): + embedded_bbox = embedded["bbox_norm"] + iou = _bbox_iou(region_bbox, embedded_bbox) + containment = _bbox_containment(region_bbox, embedded_bbox) + if ( + iou < image_match_iou_threshold + and containment < image_match_containment_threshold + ): + continue + embedded_ar = _bbox_aspect_ratio(embedded_bbox) + if not _aspect_ratio_plausible(region_ar, embedded_ar): + continue + area_ratio = _bbox_area_ratio(region_bbox, embedded_bbox) + score = max(iou, containment) + area_ratio * 0.1 + candidates.append((score, region_idx, embedded_idx)) + + candidates.sort(reverse=True) + assigned_regions = set() + assigned_embedded = set() + matches: Dict[int, Dict[str, Any]] = {} + for _, region_idx, embedded_idx in candidates: + if region_idx in assigned_regions or embedded_idx in assigned_embedded: + continue + matches[region_idx] = embedded_instances[embedded_idx] + assigned_regions.add(region_idx) + assigned_embedded.add(embedded_idx) + return matches + + +def _bbox_iou(a: List[float], b: List[float]) -> float: + ax0, ay0, ax1, ay1 = a + bx0, by0, bx1, by1 = b + inter_x0 = max(ax0, bx0) + inter_y0 = max(ay0, by0) + inter_x1 = min(ax1, bx1) + inter_y1 = min(ay1, by1) + if inter_x1 <= inter_x0 or inter_y1 <= inter_y0: + return 0.0 + inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0) + union = _bbox_area(a) + _bbox_area(b) - inter_area + return inter_area / union if union > 0 else 0.0 + + +def _bbox_containment(a: List[float], b: List[float]) -> float: + ax0, ay0, ax1, ay1 = a + bx0, by0, bx1, by1 = b + inter_x0 = max(ax0, bx0) + inter_y0 = max(ay0, by0) + inter_x1 = min(ax1, bx1) + inter_y1 = min(ay1, by1) + if inter_x1 <= inter_x0 or inter_y1 <= inter_y0: + return 0.0 + inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0) + return max(inter_area / _bbox_area(a), inter_area / _bbox_area(b)) + + +def _bbox_area(bbox: List[float]) -> float: + x0, y0, x1, y1 = bbox + return max(0.0, x1 - x0) * max(0.0, y1 - y0) + + +def _bbox_area_ratio(a: List[float], b: List[float]) -> float: + area_a = _bbox_area(a) + area_b = _bbox_area(b) + if area_a <= 0 or area_b <= 0: + return 0.0 + return min(area_a, area_b) / max(area_a, area_b) + + +def _bbox_aspect_ratio(bbox: List[float]) -> float: + x0, y0, x1, y1 = bbox + width = max(1e-9, x1 - x0) + height = max(1e-9, y1 - y0) + return width / height + + +def _aspect_ratio_plausible(a: float, b: float) -> bool: + ratio = max(a, b) / max(min(a, b), 1e-9) + return ratio <= 2.0 + + +def _choose_preferred_path( + embedded_rel_path: Optional[str], + rendered_rel_path: Optional[str], + *, + markdown_image_preference: str, +) -> Optional[str]: + if markdown_image_preference == "rendered": + return rendered_rel_path or embedded_rel_path + if embedded_rel_path: + return embedded_rel_path + return rendered_rel_path + + +def _replace_markdown_image_reference( + markdown_result: str, + region: Dict[str, Any], + page_idx: int, + bbox: List[int], + new_path: str, +) -> str: + old_path = region.get("image_path") + if old_path: + if not new_path: + import re + + pattern = re.compile(rf"!\[[^\]]*\]\({re.escape(old_path)}\)") + return pattern.sub("", markdown_result, count=1) + return markdown_result.replace(f"({old_path})", f"({new_path})", 1) + old_tag = f"![](page={page_idx},bbox={bbox})" + if not new_path: + return markdown_result.replace(old_tag, "", 1) + new_tag = f"![Image {page_idx}-{region.get('index', 0)}]({new_path})" + return markdown_result.replace(old_tag, new_tag, 1) + + +def _preserve_rendered_assets( + json_result: List[list], + markdown_result: str, + rendered_images: Dict[str, Any], +) -> Tuple[List[list], str, Dict[str, Any]]: + updated_json: List[list] = [] + normalized_images: Dict[str, Any] = {} + for page_idx, page in enumerate(json_result): + if not isinstance(page, list): + updated_json.append(page) + continue + page_copy = [] + for region in page: + if not isinstance(region, dict) or region.get("label") != "image": + page_copy.append(region) + continue + region_copy = dict(region) + image_path = region_copy.get("image_path") + rendered_path = region_copy.get("rendered_image_path") or image_path + if rendered_path: + filename = ( + rendered_path.split("/", 1)[-1] + if isinstance(rendered_path, str) + else None + ) + source_key = None + if isinstance(rendered_path, str) and rendered_path in rendered_images: + source_key = rendered_path + elif filename and filename in rendered_images: + source_key = filename + if source_key: + normalized_images[rendered_path] = rendered_images[source_key] + region_copy["image_path"] = rendered_path + region_copy["rendered_image_path"] = rendered_path + region_copy["embedded_image_path"] = None + region_copy["image_asset_source"] = "rendered" + bbox = region_copy.get("bbox_2d") + if bbox: + markdown_result = _replace_markdown_image_reference( + markdown_result, + region, + page_idx, + bbox, + rendered_path, + ) + else: + previous_image_path = region_copy.get("image_path") + region_copy["image_path"] = rendered_path + region_copy["rendered_image_path"] = rendered_path + region_copy["embedded_image_path"] = None + region_copy["image_asset_source"] = "rendered" + region_copy["_needs_rendered_export"] = True + region_copy["_previous_image_path"] = previous_image_path + else: + region_copy.setdefault("image_path", None) + region_copy.setdefault("rendered_image_path", None) + region_copy.setdefault("embedded_image_path", None) + region_copy.setdefault("image_asset_source", "rendered") + page_copy.append(region_copy) + updated_json.append(page_copy) + return updated_json, markdown_result, normalized_images + + +def _has_pending_rendered_assets(json_result: List[list]) -> bool: + for page in json_result: + if not isinstance(page, list): + continue + for region in page: + if isinstance(region, dict) and region.get("_needs_rendered_export"): + return True + return False + + +def _fill_missing_image_asset_fields(json_result: List[list]) -> List[list]: + updated_json: List[list] = [] + for page in json_result: + if not isinstance(page, list): + updated_json.append(page) + continue + page_copy = [] + for region in page: + if isinstance(region, dict) and region.get("label") == "image": + region_copy = dict(region) + region_copy.setdefault("image_path", None) + region_copy.setdefault("rendered_image_path", None) + region_copy.setdefault("embedded_image_path", None) + region_copy.setdefault("image_asset_source", "rendered") + page_copy.append(region_copy) + else: + page_copy.append(region) + updated_json.append(page_copy) + return updated_json + + +def _strip_internal_image_fields(json_result: List[list]) -> List[list]: + updated_json: List[list] = [] + for page in json_result: + if not isinstance(page, list): + updated_json.append(page) + continue + page_copy = [] + for region in page: + if isinstance(region, dict): + region_copy = dict(region) + region_copy.pop("_needs_rendered_export", None) + region_copy.pop("_previous_image_path", None) + page_copy.append(region_copy) + else: + page_copy.append(region) + updated_json.append(page_copy) + return updated_json From 7ef278336db29b873f389ed90e6713e305bb19f1 Mon Sep 17 00:00:00 2001 From: VooDisss Date: Tue, 31 Mar 2026 06:17:49 +0300 Subject: [PATCH 4/7] Harden image asset recovery and document SDK image export Close the remaining stale-asset recovery gaps in the SDK image export path so final JSON and Markdown do not advertise rendered assets unless they were actually preserved or produced. This covers both no-render-pages and explicit crop-failure branches, including cases where stale markdown originally pointed at embedded assets. Also document the image asset export feature in README.md and README_zh.md, including the exposed configuration surface, the imgs_rendered/ and imgs_embedded/ directory contract, and the stable image block fields: image_path, rendered_image_path, embedded_image_path, and image_asset_source. --- README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ README_zh.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/README.md b/README.md index 2ea7807..f6c9a29 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,11 @@ with GlmOcr(detect_printed_page_numbers=True) as parser: result = parser.parse("document.pdf") print(result.to_dict().get("page_metadata", [])) +# Export image assets with rendered and embedded outputs +with GlmOcr(enable_image_asset_export=True) as parser: + result = parser.parse("document.pdf") + result.save() + # Place layout model on CPU (useful when GPU is reserved for OCR) with GlmOcr(layout_device="cpu") as parser: result = parser.parse("image.png") @@ -308,6 +313,11 @@ pipeline: result_formatter: output_format: both # json, markdown, or both detect_printed_page_numbers: false + enable_image_asset_export: false + markdown_image_preference: embedded # embedded | rendered + image_match_iou_threshold: 0.5 + image_match_containment_threshold: 0.8 + rendered_image_dpi: 300 # Layout model device placement layout: @@ -333,6 +343,23 @@ pipeline: detect_printed_page_numbers: true ``` +Image asset export can also be enabled from Python or YAML: + +```python +with GlmOcr(enable_image_asset_export=True) as parser: + result = parser.parse("document.pdf") +``` + +```yaml +pipeline: + result_formatter: + enable_image_asset_export: true + markdown_image_preference: embedded + image_match_iou_threshold: 0.5 + image_match_containment_threshold: 0.8 + rendered_image_dpi: 300 +``` + ### Output Formats Here are two examples of output formats: @@ -380,6 +407,24 @@ saved `paper.json` is wrapped as a top-level object and includes: } ``` +When image asset export is enabled, image-like blocks can additionally expose: + +```json +{ + "label": "image", + "image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg", + "rendered_image_path": "imgs_rendered/cropped_page2_idx0.jpg", + "embedded_image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg", + "image_asset_source": "embedded" +} +``` + +Behavior summary: +- rendered image assets are written to `imgs_rendered/` +- if `enable_image_asset_export=true`, matched embedded PDF images are also written to `imgs_embedded/` +- `image_path` follows `markdown_image_preference` +- `embedded_image_path` is `null` when no embedded match exists + - Markdown ```markdown diff --git a/README_zh.md b/README_zh.md index 71504f0..c63a66d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -219,6 +219,11 @@ with GlmOcr() as parser: with GlmOcr(detect_printed_page_numbers=True) as parser: result = parser.parse("document.pdf") print(result.to_dict().get("page_metadata", [])) + +# 导出渲染图像与嵌入式 PDF 图像资产 +with GlmOcr(enable_image_asset_export=True) as parser: + result = parser.parse("document.pdf") + result.save() ``` #### Flask 服务 @@ -293,6 +298,11 @@ pipeline: result_formatter: output_format: both # json, markdown, or both detect_printed_page_numbers: false + enable_image_asset_export: false + markdown_image_preference: embedded # embedded | rendered + image_match_iou_threshold: 0.5 + image_match_containment_threshold: 0.8 + rendered_image_dpi: 300 ``` 更多选项请参考 [config.yaml](glmocr/config.yaml)。 @@ -314,6 +324,23 @@ pipeline: detect_printed_page_numbers: true ``` +图像资产导出也可以通过 Python 或 YAML 启用: + +```python +with GlmOcr(enable_image_asset_export=True) as parser: + result = parser.parse("document.pdf") +``` + +```yaml +pipeline: + result_formatter: + enable_image_asset_export: true + markdown_image_preference: embedded + image_match_iou_threshold: 0.5 + image_match_containment_threshold: 0.8 + rendered_image_dpi: 300 +``` + ### 输出格式 这里给出两种输出格式示例: @@ -360,6 +387,24 @@ pipeline: } ``` +启用图像资产导出后,图像类区域还会额外暴露: + +```json +{ + "label": "image", + "image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg", + "rendered_image_path": "imgs_rendered/cropped_page2_idx0.jpg", + "embedded_image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg", + "image_asset_source": "embedded" +} +``` + +行为说明: +- 渲染图像资产写入 `imgs_rendered/` +- 当 `enable_image_asset_export=true` 时,匹配成功的嵌入式 PDF 图像还会写入 `imgs_embedded/` +- `image_path` 会根据 `markdown_image_preference` 选择最终引用的资产 +- 若没有嵌入式匹配,`embedded_image_path` 为 `null` + - Markdown ```markdown From 2fbb2b618010809b66dbd02bfe3eef92865bbb4d Mon Sep 17 00:00:00 2001 From: VooDisss Date: Tue, 31 Mar 2026 22:11:37 +0300 Subject: [PATCH 5/7] Fix PP-DocLayoutV3 head aliasing in layout loader GLM-OCR loaded PPDocLayoutV3ForObjectDetection directly from the published Hugging Face checkpoint, but the checkpoint stores the tied detection head weights under model.enc_score_head.* and model.enc_bbox_head.layers.* while the object-detection wrapper expects model.decoder.class_embed.* and model.decoder.bbox_embed.layers.*. In practice this caused the decoder detection heads to be treated as missing and newly initialized, which surfaced as startup warnings, unstable layout behavior, and degraded self-hosted OCR results. The fix keeps the change narrow: load the PP-DocLayoutV3 config separately, load model.safetensors directly, alias the tied encoder-head keys onto the decoder-head names before model construction, and instantiate the model with from_pretrained(None, config=..., state_dict=...). This avoids broader runtime recovery logic and keeps the compatibility repair at the checkpoint-loading boundary where the mismatch actually occurs. The background investigation included local inspection of the cached safetensors checkpoint, installed transformers 5.4.0 PP-DocLayoutV3 source, Paddle inference artifacts, and upstream release context. The key finding was that the checkpoint is not headless: the trained head weights are present under enc_* names, and the local transformers implementation explicitly declares decoder.class_embed <-> enc_score_head and decoder.bbox_embed <-> enc_bbox_head as tied/shared weight groups. That made aliasing the minimal defensible fix for GLM-OCR rather than reworking the full layout runtime. Tests were updated only as needed for the new load path. Existing detector device-selection tests now stub the config and prepared state-dict helpers, and a focused unit test verifies that _prepare_pp_doclayout_state_dict aliases encoder-head weights into the decoder-head keys expected by the object-detection wrapper. Validation also included a real self-hosted pipeline run over local PDFs, where the old missing decoder-head load report disappeared and processing completed successfully after the fix. --- glmocr/layout/layout_detector.py | 47 ++++++++++++++++++++++- glmocr/tests/test_unit.py | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/glmocr/layout/layout_detector.py b/glmocr/layout/layout_detector.py index c4f1680..c83e442 100644 --- a/glmocr/layout/layout_detector.py +++ b/glmocr/layout/layout_detector.py @@ -2,13 +2,17 @@ from __future__ import annotations +from pathlib import Path from typing import TYPE_CHECKING, List, Dict import cv2 import torch import numpy as np from PIL import Image +from huggingface_hub import hf_hub_download +from safetensors.torch import load_file from transformers import ( + PPDocLayoutV3Config, PPDocLayoutV3ForObjectDetection, PPDocLayoutV3ImageProcessorFast, ) @@ -56,6 +60,43 @@ def __init__(self, config: "LayoutConfig"): self._image_processor = None self._device = None + def _resolve_model_weights_path(self) -> Path: + """Return the local safetensors path for the configured model.""" + model_path = Path(self.model_dir) + if model_path.is_dir(): + return model_path / "model.safetensors" + return Path( + hf_hub_download(repo_id=self.model_dir, filename="model.safetensors") + ) + + def _prepare_pp_doclayout_state_dict(self) -> Dict[str, torch.Tensor]: + """Alias tied PP-DocLayoutV3 detection head weights before model load.""" + state_dict = load_file(str(self._resolve_model_weights_path())) + alias_pairs = { + "model.enc_score_head.weight": "model.decoder.class_embed.weight", + "model.enc_score_head.bias": "model.decoder.class_embed.bias", + "model.enc_bbox_head.layers.0.weight": "model.decoder.bbox_embed.layers.0.weight", + "model.enc_bbox_head.layers.0.bias": "model.decoder.bbox_embed.layers.0.bias", + "model.enc_bbox_head.layers.1.weight": "model.decoder.bbox_embed.layers.1.weight", + "model.enc_bbox_head.layers.1.bias": "model.decoder.bbox_embed.layers.1.bias", + "model.enc_bbox_head.layers.2.weight": "model.decoder.bbox_embed.layers.2.weight", + "model.enc_bbox_head.layers.2.bias": "model.decoder.bbox_embed.layers.2.bias", + } + + remapped = [] + for source_key, target_key in alias_pairs.items(): + if source_key in state_dict and target_key not in state_dict: + state_dict[target_key] = state_dict[source_key].clone() + remapped.append(target_key) + + if remapped: + logger.warning( + "Prepared PP-DocLayoutV3 state dict with decoder head aliases: %s", + ", ".join(remapped), + ) + + return state_dict + def start(self): """Load model and processor once in the main process.""" logger.debug("Initializing PP-DocLayoutV3...") @@ -63,7 +104,11 @@ def start(self): self._image_processor = PPDocLayoutV3ImageProcessorFast.from_pretrained( self.model_dir ) - self._model = PPDocLayoutV3ForObjectDetection.from_pretrained(self.model_dir) + self._model = PPDocLayoutV3ForObjectDetection.from_pretrained( + None, + config=PPDocLayoutV3Config.from_pretrained(self.model_dir), + state_dict=self._prepare_pp_doclayout_state_dict(), + ) self._model.eval() # Device selection priority: diff --git a/glmocr/tests/test_unit.py b/glmocr/tests/test_unit.py index 88f462f..2481c2f 100644 --- a/glmocr/tests/test_unit.py +++ b/glmocr/tests/test_unit.py @@ -145,10 +145,15 @@ def test_detector_device_selection_explicit_cpu(self): "glmocr.layout.layout_detector.PPDocLayoutV3ForObjectDetection.from_pretrained", return_value=mock_model, ), + patch( + "glmocr.layout.layout_detector.PPDocLayoutV3Config.from_pretrained", + return_value=MagicMock(), + ), patch( "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", return_value=mock_proc, ), + patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), ): det.start() @@ -164,10 +169,15 @@ def test_detector_device_selection_explicit_cuda(self): "glmocr.layout.layout_detector.PPDocLayoutV3ForObjectDetection.from_pretrained", return_value=mock_model, ), + patch( + "glmocr.layout.layout_detector.PPDocLayoutV3Config.from_pretrained", + return_value=MagicMock(), + ), patch( "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", return_value=mock_proc, ), + patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), ): det.start() @@ -185,11 +195,16 @@ def test_detector_device_selection_auto_fallback_cpu(self): "glmocr.layout.layout_detector.PPDocLayoutV3ForObjectDetection.from_pretrained", return_value=mock_model, ), + patch( + "glmocr.layout.layout_detector.PPDocLayoutV3Config.from_pretrained", + return_value=MagicMock(), + ), patch( "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", return_value=mock_proc, ), patch.object(torch.cuda, "is_available", return_value=False), + patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), ): det.start() @@ -218,16 +233,65 @@ def test_detector_device_selection_auto_cuda(self): "glmocr.layout.layout_detector.PPDocLayoutV3ForObjectDetection.from_pretrained", return_value=mock_model, ), + patch( + "glmocr.layout.layout_detector.PPDocLayoutV3Config.from_pretrained", + return_value=MagicMock(), + ), patch( "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", return_value=mock_proc, ), patch.object(torch.cuda, "is_available", return_value=True), + patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), ): det.start() assert det._device == "cuda:1" + def test_detector_prepares_pp_doclayout_decoder_head_aliases(self): + """State dict preparation aliases tied encoder head weights for load.""" + torch = self._require_layout_runtime() + + det, _, _ = self._mock_detector("cpu") + state_dict = { + "model.enc_score_head.weight": torch.full((3, 4), 1.25), + "model.enc_score_head.bias": torch.full((3,), 2.5), + "model.enc_bbox_head.layers.0.weight": torch.full((4, 4), 1.0), + "model.enc_bbox_head.layers.0.bias": torch.full((4,), 11.0), + "model.enc_bbox_head.layers.1.weight": torch.full((4, 4), 2.0), + "model.enc_bbox_head.layers.1.bias": torch.full((4,), 12.0), + "model.enc_bbox_head.layers.2.weight": torch.full((4, 4), 3.0), + "model.enc_bbox_head.layers.2.bias": torch.full((4,), 13.0), + } + + with patch.object( + det, + "_resolve_model_weights_path", + return_value=Path("dummy-model.safetensors"), + ), patch( + "glmocr.layout.layout_detector.load_file", + return_value=state_dict.copy(), + ): + prepared = det._prepare_pp_doclayout_state_dict() + + assert torch.equal( + prepared["model.decoder.class_embed.weight"], + state_dict["model.enc_score_head.weight"], + ) + assert torch.equal( + prepared["model.decoder.class_embed.bias"], + state_dict["model.enc_score_head.bias"], + ) + for idx in range(3): + assert torch.equal( + prepared[f"model.decoder.bbox_embed.layers.{idx}.weight"], + state_dict[f"model.enc_bbox_head.layers.{idx}.weight"], + ) + assert torch.equal( + prepared[f"model.decoder.bbox_embed.layers.{idx}.bias"], + state_dict[f"model.enc_bbox_head.layers.{idx}.bias"], + ) + class TestPageLoader: """Tests for PageLoader.""" From 6999f29128af4d3c7d2da4e98ac3fdc45fa772b6 Mon Sep 17 00:00:00 2001 From: VooDisss Date: Tue, 31 Mar 2026 22:44:50 +0300 Subject: [PATCH 6/7] Replace deprecated PP-DocLayout image processor Follow up the PP-DocLayoutV3 checkpoint aliasing fix by switching the layout detector from PPDocLayoutV3ImageProcessorFast to PPDocLayoutV3ImageProcessor. Under the current transformers 5.4.0 runtime, the Fast-suffixed processor emits a deprecation warning on every worker startup even though the rest of the layout path is functioning correctly. This change is intentionally narrow. It does not alter the checkpoint aliasing logic, model loading strategy, layout post-processing, or device-selection behavior. The only production change is to use the non-deprecated image processor entry point that transformers now expects. Tests were updated only where the detector startup path mocks the image processor loader. The need for this cleanup was confirmed by a real self-hosted OCR pipeline run after the head-aliasing fix landed. That run showed successful PP-DocLayoutV3 startup and processing, but still printed the deprecation warning telling callers to use PPDocLayoutV3ImageProcessor instead of the Fast variant. Replacing the import and matching test patches removes that remaining startup warning without widening the scope of the loader fix. Validation included re-running the focused detector test slice covering detector device selection and PP-DocLayout decoder-head aliasing, which passed after the rename. A subsequent real OCR pipeline run on local PDFs also started and processed documents without the previous deprecation warning, confirming that the cleanup behaves correctly in the actual self-hosted path. --- glmocr/layout/layout_detector.py | 4 ++-- glmocr/tests/test_unit.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/glmocr/layout/layout_detector.py b/glmocr/layout/layout_detector.py index c83e442..1e73970 100644 --- a/glmocr/layout/layout_detector.py +++ b/glmocr/layout/layout_detector.py @@ -14,7 +14,7 @@ from transformers import ( PPDocLayoutV3Config, PPDocLayoutV3ForObjectDetection, - PPDocLayoutV3ImageProcessorFast, + PPDocLayoutV3ImageProcessor, ) from glmocr.layout.base import BaseLayoutDetector @@ -101,7 +101,7 @@ def start(self): """Load model and processor once in the main process.""" logger.debug("Initializing PP-DocLayoutV3...") - self._image_processor = PPDocLayoutV3ImageProcessorFast.from_pretrained( + self._image_processor = PPDocLayoutV3ImageProcessor.from_pretrained( self.model_dir ) self._model = PPDocLayoutV3ForObjectDetection.from_pretrained( diff --git a/glmocr/tests/test_unit.py b/glmocr/tests/test_unit.py index 2481c2f..ba1a875 100644 --- a/glmocr/tests/test_unit.py +++ b/glmocr/tests/test_unit.py @@ -150,7 +150,7 @@ def test_detector_device_selection_explicit_cpu(self): return_value=MagicMock(), ), patch( - "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", + "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessor.from_pretrained", return_value=mock_proc, ), patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), @@ -174,7 +174,7 @@ def test_detector_device_selection_explicit_cuda(self): return_value=MagicMock(), ), patch( - "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", + "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessor.from_pretrained", return_value=mock_proc, ), patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), @@ -200,7 +200,7 @@ def test_detector_device_selection_auto_fallback_cpu(self): return_value=MagicMock(), ), patch( - "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", + "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessor.from_pretrained", return_value=mock_proc, ), patch.object(torch.cuda, "is_available", return_value=False), @@ -238,7 +238,7 @@ def test_detector_device_selection_auto_cuda(self): return_value=MagicMock(), ), patch( - "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", + "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessor.from_pretrained", return_value=mock_proc, ), patch.object(torch.cuda, "is_available", return_value=True), From 09456d1df304d8a56fee2b13be392d305d4577d3 Mon Sep 17 00:00:00 2001 From: VooDisss Date: Thu, 2 Apr 2026 00:49:04 +0300 Subject: [PATCH 7/7] Fix according to precommit checks --- glmocr/tests/test_unit.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/glmocr/tests/test_unit.py b/glmocr/tests/test_unit.py index ba1a875..d2a5473 100644 --- a/glmocr/tests/test_unit.py +++ b/glmocr/tests/test_unit.py @@ -264,13 +264,16 @@ def test_detector_prepares_pp_doclayout_decoder_head_aliases(self): "model.enc_bbox_head.layers.2.bias": torch.full((4,), 13.0), } - with patch.object( - det, - "_resolve_model_weights_path", - return_value=Path("dummy-model.safetensors"), - ), patch( - "glmocr.layout.layout_detector.load_file", - return_value=state_dict.copy(), + with ( + patch.object( + det, + "_resolve_model_weights_path", + return_value=Path("dummy-model.safetensors"), + ), + patch( + "glmocr.layout.layout_detector.load_file", + return_value=state_dict.copy(), + ), ): prepared = det._prepare_pp_doclayout_state_dict()