diff --git a/README.md b/README.md index 7635611..2ea7807 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,11 @@ with GlmOcr() as parser: print(result.json_result) result.save() +# Extract printed page numbers from PP-DocLayoutV3 `number` regions +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") + print(result.to_dict().get("page_metadata", [])) + # Place layout model on CPU (useful when GPU is reserved for OCR) with GlmOcr(layout_device="cpu") as parser: result = parser.parse("image.png") @@ -302,6 +307,7 @@ pipeline: # Result formatting result_formatter: output_format: both # json, markdown, or both + detect_printed_page_numbers: false # Layout model device placement layout: @@ -310,6 +316,23 @@ pipeline: See [config.yaml](glmocr/config.yaml) for all options. +Printed page number detection can be enabled in three ways: + +```python +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") +``` + +```powershell +$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true' +``` + +```yaml +pipeline: + result_formatter: + detect_printed_page_numbers: true +``` + ### Output Formats Here are two examples of output formats: @@ -320,6 +343,43 @@ Here are two examples of output formats: [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]] ``` +When printed page number detection is enabled and printed page number data is actually found, +the saved `paper.json` is wrapped in a top-level object that includes: + +```json +{ + "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]], + "page_number_candidates": [ + { + "page_index": 1, + "label": "number", + "content": "22", + "layout_index": 0, + "bbox_2d": [92, 26, 120, 41], + "layout_score": 0.77, + "numeric_like": true, + "roman_like": false + } + ], + "document_page_numbering": { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 21, + "candidate_pages": 4 + }, + 
"page_metadata": [ + { + "page_index": 1, + "printed_page_label": "22", + "printed_page_block_index": 0, + "printed_page_bbox_2d": [92, 26, 120, 41], + "printed_page_confidence": 0.77 + } + ] +} +``` + - Markdown ```markdown diff --git a/README_zh.md b/README_zh.md index 8a2f3fe..71504f0 100644 --- a/README_zh.md +++ b/README_zh.md @@ -214,6 +214,11 @@ with GlmOcr() as parser: result = parser.parse("image.png") print(result.json_result) result.save() + +# 从 PP-DocLayoutV3 的 `number` 区域提取印刷页码 +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") + print(result.to_dict().get("page_metadata", [])) ``` #### Flask 服务 @@ -287,10 +292,28 @@ pipeline: # Result formatting result_formatter: output_format: both # json, markdown, or both + detect_printed_page_numbers: false ``` 更多选项请参考 [config.yaml](glmocr/config.yaml)。 +印刷页码检测支持以下三种启用方式: + +```python +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") +``` + +```powershell +$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true' +``` + +```yaml +pipeline: + result_formatter: + detect_printed_page_numbers: true +``` + ### 输出格式 这里给出两种输出格式示例: @@ -301,6 +324,42 @@ pipeline: [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]] ``` +启用印刷页码检测且实际检测到印刷页码数据时,保存的 `paper.json` 会变成顶层对象,并包含: + +```json +{ + "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]], + "page_number_candidates": [ + { + "page_index": 1, + "label": "number", + "content": "22", + "layout_index": 0, + "bbox_2d": [92, 26, 120, 41], + "layout_score": 0.77, + "numeric_like": true, + "roman_like": false + } + ], + "document_page_numbering": { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 21, + "candidate_pages": 4 + }, + "page_metadata": [ + { + "page_index": 1, + "printed_page_label": "22", + "printed_page_block_index": 0, + "printed_page_bbox_2d": [92, 26, 120, 41], + 
"printed_page_confidence": 0.77 + } + ] +} +``` + - Markdown ```markdown diff --git a/glmocr/api.py b/glmocr/api.py index dcf5570..ffb218a 100644 --- a/glmocr/api.py +++ b/glmocr/api.py @@ -84,6 +84,7 @@ def __init__( ocr_api_port: Optional[int] = None, cuda_visible_devices: Optional[str] = None, layout_device: Optional[str] = None, + detect_printed_page_numbers: Optional[bool] = None, **kwargs: Any, ): """Initialize GlmOcr. @@ -130,6 +131,7 @@ def __init__( ocr_api_port=ocr_api_port, cuda_visible_devices=cuda_visible_devices, layout_device=layout_device, + detect_printed_page_numbers=detect_printed_page_numbers, **kwargs, ) # Apply logging config for API/SDK usage. @@ -441,8 +443,11 @@ def _maas_response_to_pipeline_result( { "index": region.get("index", 0), "label": region.get("label", "text"), + "native_label": region.get("label", "text"), "content": region.get("content", ""), "bbox_2d": bbox, + "layout_index": region.get("index", 0), + "layout_score": float(region.get("score") or 0.0), } ) json_result.append(page_result) @@ -460,12 +465,32 @@ def _maas_response_to_pipeline_result( source, ) + page_metadata = None + page_number_candidates = None + document_page_numbering = None + if self.config_model.pipeline.result_formatter.detect_printed_page_numbers: + from glmocr.postprocess import ResultFormatter + + formatter = ResultFormatter(self.config_model.pipeline.result_formatter) + ( + page_number_candidates, + document_page_numbering, + page_metadata, + ) = formatter.extract_printed_page_data(json_result) + + from glmocr.postprocess import ResultFormatter + + ResultFormatter._strip_layout_metadata(json_result) + # Create PipelineResult result = PipelineResult( json_result=json_result, markdown_result=markdown_result, original_images=[source], image_files=image_files or None, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) # Store additional MaaS response data diff --git 
a/glmocr/config.py b/glmocr/config.py index 789e206..e348db5 100644 --- a/glmocr/config.py +++ b/glmocr/config.py @@ -48,6 +48,8 @@ def _find_dotenv(start: Optional[Path] = None) -> Optional[Path]: "LAYOUT_CUDA_VISIBLE_DEVICES": "pipeline.layout.cuda_visible_devices", # Explicit device for layout model: "cpu", "cuda", "cuda:0", etc. "LAYOUT_DEVICE": "pipeline.layout.device", + # Result formatter + "DETECT_PRINTED_PAGE_NUMBERS": "pipeline.result_formatter.detect_printed_page_numbers", # Logging "LOG_LEVEL": "logging.level", } @@ -175,6 +177,7 @@ class ResultFormatterConfig(_BaseConfig): enable_merge_formula_numbers: bool = True enable_merge_text_blocks: bool = True enable_format_bullet_points: bool = True + detect_printed_page_numbers: bool = False label_visualization_mapping: Dict[str, Any] = Field(default_factory=dict) @@ -260,6 +263,8 @@ def _coerce_env_value(dotted_path: str, raw: str) -> Any: # Boolean fields if dotted_path == "pipeline.maas.enabled": return raw.strip().lower() in ("maas", "true", "1", "yes") + if dotted_path == "pipeline.result_formatter.detect_printed_page_numbers": + return raw.strip().lower() in ("true", "1", "yes", "on") # Integer fields if dotted_path.endswith((".api_port", ".request_timeout", ".connect_timeout")): return int(raw) @@ -429,6 +434,7 @@ def from_env( "mode": "pipeline.maas.enabled", "timeout": "pipeline.maas.request_timeout", "log_level": "logging.level", + "detect_printed_page_numbers": "pipeline.result_formatter.detect_printed_page_numbers", # Self-hosted OCR API "ocr_api_host": "pipeline.ocr_api.api_host", "ocr_api_port": "pipeline.ocr_api.api_port", diff --git a/glmocr/config.yaml b/glmocr/config.yaml index 8c287fe..5679570 100644 --- a/glmocr/config.yaml +++ b/glmocr/config.yaml @@ -164,6 +164,7 @@ pipeline: - content - doc_title - figure_title + - number - paragraph_title - reference_content - text @@ -256,6 +257,7 @@ pipeline: - content - doc_title - figure_title + - number - paragraph_title - reference_content - text 
@@ -274,7 +276,6 @@ pipeline: abandon: - header - footer - - number - footnote - aside_text - reference diff --git a/glmocr/parser_result/base.py b/glmocr/parser_result/base.py index 0b996c2..ae8963c 100644 --- a/glmocr/parser_result/base.py +++ b/glmocr/parser_result/base.py @@ -30,6 +30,9 @@ def __init__( original_images: Optional[List[str]] = None, image_files: Optional[Dict[str, Any]] = None, raw_json_result: Optional[list] = None, + page_metadata: Optional[List[Dict[str, Any]]] = None, + page_number_candidates: Optional[List[Dict[str, Any]]] = None, + document_page_numbering: Optional[Dict[str, Any]] = None, ): """Initialize. @@ -41,6 +44,9 @@ def __init__( regions, to be saved under ``imgs/`` during :meth:`save`. raw_json_result: Raw model output before post-processing; saved as ``{name}_model.json`` alongside the final result. + page_metadata: Derived per-page printed page metadata. + page_number_candidates: Raw printed page-number candidates. + document_page_numbering: Document-level numbering inference. 
""" if isinstance(json_result, str): try: @@ -56,6 +62,9 @@ def __init__( ] self.image_files = image_files self.raw_json_result = raw_json_result + self.page_metadata = page_metadata + self.page_number_candidates = page_number_candidates + self.document_page_numbering = document_page_numbering @abstractmethod def save( @@ -88,6 +97,27 @@ def _save_json_and_markdown(self, output_dir: Union[str, Path]) -> None: json_data = json.loads(json_data) except json.JSONDecodeError: pass + + has_printed_page_data = ( + bool(self.page_metadata) + or bool(self.page_number_candidates) + or self.document_page_numbering is not None + ) + + if has_printed_page_data: + json_data = { + "json_result": json_data, + "page_metadata": ( + self.page_metadata if self.page_metadata is not None else [] + ), + "page_number_candidates": ( + self.page_number_candidates + if self.page_number_candidates is not None + else [] + ), + "document_page_numbering": self.document_page_numbering, + } + with open(json_file, "w", encoding="utf-8") as f: if isinstance(json_data, (dict, list)): json.dump(json_data, f, ensure_ascii=False, indent=2) @@ -134,6 +164,12 @@ def to_dict(self) -> dict: "markdown_result": self.markdown_result or "", "original_images": self.original_images, } + if self.page_metadata is not None: + d["page_metadata"] = self.page_metadata + if self.page_number_candidates is not None: + d["page_number_candidates"] = self.page_number_candidates + if self.document_page_numbering is not None: + d["document_page_numbering"] = self.document_page_numbering # Include optional metadata set by MaaS mode. 
for attr in ("_usage", "_data_info", "_error"): val = getattr(self, attr, None) diff --git a/glmocr/parser_result/pipeline_result.py b/glmocr/parser_result/pipeline_result.py index 800084c..7a9e9d9 100644 --- a/glmocr/parser_result/pipeline_result.py +++ b/glmocr/parser_result/pipeline_result.py @@ -26,6 +26,9 @@ def __init__( image_files: Optional[dict] = None, raw_json_result: Optional[list] = None, layout_vis_images: Optional[Dict[int, Any]] = None, + page_metadata: Optional[List[Dict[str, Any]]] = None, + page_number_candidates: Optional[List[Dict[str, Any]]] = None, + document_page_numbering: Optional[Dict[str, Any]] = None, ): """Initialize. @@ -38,6 +41,9 @@ def __init__( raw_json_result: Raw model output before post-processing (optional). layout_vis_images: Mapping of ``page_idx`` → PIL Image for layout visualization; saved to ``layout_vis/`` during :meth:`save`. + page_metadata: Derived per-page printed page metadata. + page_number_candidates: Raw printed page-number candidates. + document_page_numbering: Document-level numbering inference. 
""" super().__init__( json_result=json_result, @@ -45,6 +51,9 @@ def __init__( original_images=original_images, image_files=image_files, raw_json_result=raw_json_result, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) self.layout_vis_images = layout_vis_images diff --git a/glmocr/pipeline/pipeline.py b/glmocr/pipeline/pipeline.py index 699f48d..b1c48ac 100644 --- a/glmocr/pipeline/pipeline.py +++ b/glmocr/pipeline/pipeline.py @@ -362,6 +362,9 @@ def _emit_results( grouped, cropped_images=cropped_images or None, ) + page_metadata = self.result_formatter.page_metadata + page_number_candidates = self.result_formatter.page_number_candidates + document_page_numbering = self.result_formatter.document_page_numbering vis_images = {} for pi in page_indices: @@ -378,6 +381,9 @@ def _emit_results( image_files=image_files or None, raw_json_result=raw_json, layout_vis_images=vis_images or None, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) built.add(u) if preserve_order: diff --git a/glmocr/postprocess/result_formatter.py b/glmocr/postprocess/result_formatter.py index 8b31d5d..764ab27 100644 --- a/glmocr/postprocess/result_formatter.py +++ b/glmocr/postprocess/result_formatter.py @@ -12,10 +12,11 @@ from __future__ import annotations +import collections import re import json from copy import deepcopy -from typing import TYPE_CHECKING, List, Dict, Tuple, Any +from typing import TYPE_CHECKING, List, Dict, Tuple, Any, Optional try: # Optional dependency for better English word validation quality. 
from wordfreq import zipf_frequency @@ -70,6 +71,10 @@ def __init__(self, config: "ResultFormatterConfig"): self.enable_merge_formula_numbers = config.enable_merge_formula_numbers self.enable_merge_text_blocks = config.enable_merge_text_blocks self.enable_format_bullet_points = config.enable_format_bullet_points + self.detect_printed_page_numbers = config.detect_printed_page_numbers + self.page_metadata: Optional[List[Dict[str, Any]]] = None + self.page_number_candidates: Optional[List[Dict[str, Any]]] = None + self.document_page_numbering: Optional[Dict[str, Any]] = None # ========================================================================= # OCR-only mode @@ -160,6 +165,10 @@ def process( (json_str, markdown_str, image_files) where *image_files* maps ``filename`` → PIL Image for the caller to persist. """ + self.page_metadata = None + self.page_number_candidates = None + self.document_page_numbering = None + json_final_results = [] with profiler.measure("format_regions"): @@ -173,6 +182,12 @@ def process( for item in sorted_results: result = deepcopy(item) + result["layout_index"] = result.get( + "layout_index", result.get("index", 0) + ) + result["layout_score"] = float( + result.get("layout_score", result.get("score") or 0.0) + ) result["native_label"] = result.get("label", "text") # Map labels @@ -215,6 +230,15 @@ def process( json_final_results.append(json_page_results) + if self.detect_printed_page_numbers: + ( + self.page_number_candidates, + self.document_page_numbering, + self.page_metadata, + ) = self.extract_printed_page_data(json_final_results) + + self._strip_layout_metadata(json_final_results) + # Generate markdown results and resolve image regions image_files: Dict[str, Any] = {} image_counter = 0 @@ -251,6 +275,190 @@ def process( return json_str, markdown_str, image_files + def extract_printed_page_data( + self, + pages: List[List[Dict[str, Any]]], + ) -> Tuple[ + List[Dict[str, Any]], + Optional[Dict[str, Any]], + List[Dict[str, Any]], + ]: 
+ """Extract number candidates and derived printed page metadata.""" + candidates = self._extract_page_number_candidates(pages) + document_page_numbering = self._infer_document_page_numbering(candidates) + page_metadata = self._build_printed_page_metadata(candidates) + return candidates, document_page_numbering, page_metadata + + def _extract_page_number_candidates( + self, + pages: List[List[Dict[str, Any]]], + ) -> List[Dict[str, Any]]: + """Extract raw `number` candidates for printed page inference.""" + candidates: List[Dict[str, Any]] = [] + for page_index, page_blocks in enumerate(pages): + for block in page_blocks: + candidate = self._build_page_number_candidate(page_index, block) + if candidate is not None: + candidates.append(candidate) + return candidates + + def _build_page_number_candidate( + self, + page_index: int, + block: Dict[str, Any], + ) -> Optional[Dict[str, Any]]: + """Build a normalized page-number candidate from one layout block.""" + if block.get("native_label") != "number": + return None + + bbox = block.get("bbox_2d") + if not isinstance(bbox, list) or len(bbox) != 4: + return None + + label = self._normalize_printed_page_label(block.get("content")) + if label is None: + return None + + x1, y1, x2, y2 = bbox + width = x2 - x1 + height = y2 - y1 + if width <= 0 or height <= 0 or width > 140 or height > 120: + return None + if not self._is_margin_candidate(x1, y1, x2, y2): + return None + + return { + "page_index": page_index, + "label": "number", + "content": label, + "layout_index": block.get("layout_index", block.get("index", 0)), + "bbox_2d": bbox, + "layout_score": float(block.get("layout_score") or 0.0), + "numeric_like": label.isdigit(), + "roman_like": self._is_roman_like(label), + } + + @staticmethod + def _is_margin_candidate(x1: int, y1: int, x2: int, y2: int) -> bool: + """Return whether a candidate lies in a plausible page-margin folio area.""" + in_margin_band = y1 <= 120 or y2 >= 880 + in_outer_margin = x1 <= 180 or x2 >= 820 
+ return in_margin_band and in_outer_margin + + @staticmethod + def _is_roman_like(content: str) -> bool: + """Check whether a label looks like a Roman numeral folio.""" + return bool(re.fullmatch(r"(?i)[ivxlcdm]+", content)) + + def _infer_document_page_numbering( + self, + candidates: List[Dict[str, Any]], + ) -> Optional[Dict[str, Any]]: + """Infer document-level numbering from number-only candidates.""" + if not candidates: + return None + + best_candidates = self._best_candidates_by_page(candidates) + page_count = len(best_candidates) + numeric_candidates = [c for c in best_candidates if c["numeric_like"]] + roman_candidates = [c for c in best_candidates if c["roman_like"]] + + if numeric_candidates: + offsets = collections.Counter( + int(c["content"]) - int(c["page_index"]) for c in numeric_candidates + ) + page_offset, support = offsets.most_common(1)[0] + return { + "strategy": "visual_sequence", + "confidence": round(support / max(1, page_count), 3), + "sequence_type": "arabic", + "page_offset": page_offset, + "candidate_pages": page_count, + } + + if roman_candidates: + return { + "strategy": "visual_sequence", + "confidence": round(len(roman_candidates) / max(1, page_count), 3), + "sequence_type": "roman", + "page_offset": None, + "candidate_pages": len(roman_candidates), + } + + return None + + def _build_printed_page_metadata( + self, + candidates: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + """Build per-page printed page metadata from selected candidates.""" + if not candidates: + return [] + + metadata: List[Dict[str, Any]] = [] + for candidate in self._best_candidates_by_page(candidates): + metadata.append( + { + "page_index": candidate["page_index"], + "printed_page_label": candidate["content"], + "printed_page_block_index": candidate["layout_index"], + "printed_page_bbox_2d": candidate["bbox_2d"], + "printed_page_confidence": candidate["layout_score"], + } + ) + return metadata + + def _best_candidates_by_page( + self, + candidates: 
List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + """Select the strongest candidate per page.""" + by_page: Dict[int, List[Dict[str, Any]]] = collections.defaultdict(list) + for candidate in candidates: + by_page[int(candidate["page_index"])].append(candidate) + return [ + min(by_page[page_index], key=self._candidate_sort_key) + for page_index in sorted(by_page) + ] + + @staticmethod + def _candidate_sort_key(block: Dict[str, Any]) -> tuple[int, int, int, int]: + """Prefer blocks nearest to outer top/bottom page margins.""" + bbox = block.get("bbox_2d") or [0, 0, 1000, 1000] + x1, y1, x2, y2 = bbox + top_distance = y1 + bottom_distance = 1000 - y2 + edge_distance = min(top_distance, bottom_distance) + side_distance = min(x1, 1000 - x2) + return ( + edge_distance, + side_distance, + -int(block.get("layout_score", 0) * 1000), + int(block.get("layout_index", block.get("index", 0))), + ) + + @staticmethod + def _normalize_printed_page_label(content: Any) -> Optional[str]: + """Normalize OCR text from a printed page-number candidate.""" + if not isinstance(content, str): + return None + label = content.strip() + if not label or len(label) > 12: + return None + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9\-./]*", label): + return None + if not (re.search(r"\d", label) or ResultFormatter._is_roman_like(label)): + return None + return label + + @staticmethod + def _strip_layout_metadata(pages: List[List[Dict[str, Any]]]) -> None: + """Remove broad layout-only metadata from final JSON blocks.""" + for page in pages: + for block in page: + block.pop("layout_index", None) + block.pop("layout_score", None) + # ========================================================================= # Content handling # ========================================================================= diff --git a/glmocr/tests/test_unit.py b/glmocr/tests/test_unit.py index 62ba50d..7d55353 100644 --- a/glmocr/tests/test_unit.py +++ b/glmocr/tests/test_unit.py @@ -1,6 +1,7 @@ """Unit tests for glmocr 
(no external services required).""" import json +import tempfile from pathlib import Path from unittest.mock import MagicMock, patch @@ -24,6 +25,14 @@ def test_config_to_dict(self): cfg = load_config().to_dict() assert isinstance(cfg, dict) + def test_default_config_routes_number_to_text_ocr(self): + """Default SDK config preserves PP-DocLayoutV3 number regions for OCR.""" + from glmocr.config import load_config + + cfg = load_config() + text_labels = cfg.pipeline.layout.label_task_mapping["text"] + assert "number" in text_labels + class TestLayoutDeviceUnit: """Unit tests for layout device selection and config plumbing (mocked).""" @@ -494,6 +503,164 @@ def test_result_formatter_clean_content(self): cleaned = formatter._clean_content("Hello....World") assert "....." not in cleaned + def test_result_formatter_feature_off_keeps_json_result_lean(self): + """Feature disabled does not leak broad layout metadata into json_result.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter(ResultFormatterConfig()) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [944, 12, 972, 42], + "score": 0.88, + } + ] + ] + + parsed = json.loads(formatter.process(grouped_results)[0]) + assert parsed[0][0]["native_label"] == "number" + assert "layout_index" not in parsed[0][0] + assert "layout_score" not in parsed[0][0] + + def test_result_formatter_extracts_page_number_data(self): + """Formatter extracts printed page data from number blocks.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [944, 12, 972, 42], + "score": 0.88, + } + ] + ] + + formatter.process(grouped_results) + + assert 
formatter.page_number_candidates[0]["layout_index"] == 7 + assert formatter.page_number_candidates[0]["layout_score"] == 0.88 + assert formatter.page_metadata == [ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ] + assert formatter.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ] + assert formatter.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + } + + def test_result_formatter_ignores_non_margin_number_blocks(self): + """Formatter ignores number blocks that are not in page margins.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [400, 400, 428, 430], + "score": 0.88, + } + ] + ] + + formatter.process(grouped_results) + + parsed = json.loads(formatter.process(grouped_results)[0]) + assert "layout_index" not in parsed[0][0] + assert "layout_score" not in parsed[0][0] + assert formatter.page_metadata == [] + assert formatter.page_number_candidates == [] + assert formatter.document_page_numbering is None + + def test_result_formatter_accepts_roman_number_candidates(self): + """Formatter preserves Roman numeral number candidates.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "iv", + "bbox_2d": [944, 
12, 972, 42], + "score": 0.75, + } + ] + ] + + formatter.process(grouped_results) + + assert formatter.page_number_candidates[0]["layout_index"] == 7 + assert formatter.page_number_candidates[0]["layout_score"] == 0.75 + assert formatter.page_metadata == [ + { + "page_index": 0, + "printed_page_label": "iv", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.75, + } + ] + assert formatter.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "iv", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.75, + "numeric_like": False, + "roman_like": True, + } + ] + assert formatter.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "roman", + "page_offset": None, + "candidate_pages": 1, + } + class TestMaaSClient: """Tests for MaaSClient.""" @@ -845,6 +1012,17 @@ def test_no_env_returns_empty(self, monkeypatch): assert _collect_env_overrides() == {} + def test_detect_printed_page_numbers_env_var(self, monkeypatch): + """Printed page detection can be enabled via environment variable.""" + from glmocr.config import _collect_env_overrides + + monkeypatch.setenv("GLMOCR_DETECT_PRINTED_PAGE_NUMBERS", "true") + overrides = _collect_env_overrides() + assert ( + overrides["pipeline"]["result_formatter"]["detect_printed_page_numbers"] + is True + ) + class TestFromEnv: """Tests for GlmOcrConfig.from_env() – full priority chain.""" @@ -981,6 +1159,107 @@ def test_to_json_unicode_preserved(self): # ensure_ascii=False by default → raw CJK characters assert "中文测试" in s + def test_to_dict_includes_printed_page_fields(self): + r = self._make_result( + page_metadata=[ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ], + page_number_candidates=[ + { + "page_index": 0, + "label": "number", + "content": 
"12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ], + document_page_numbering={ + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + }, + ) + d = r.to_dict() + assert d["page_metadata"][0]["printed_page_label"] == "12" + assert d["page_number_candidates"][0]["label"] == "number" + assert d["document_page_numbering"]["page_offset"] == 12 + + def test_save_wraps_json_with_printed_page_fields(self): + r = self._make_result( + original_images=["paper.pdf"], + page_metadata=[ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ], + page_number_candidates=[ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ], + document_page_numbering={ + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + }, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert "json_result" in saved + assert saved["page_metadata"][0]["printed_page_label"] == "12" + assert saved["page_number_candidates"][0]["label"] == "number" + assert saved["document_page_numbering"]["page_offset"] == 12 + + def test_save_keeps_legacy_json_shape_without_printed_page_data(self): + r = self._make_result(original_images=["paper.pdf"]) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert isinstance(saved, list) + + def 
test_save_keeps_legacy_json_shape_when_detection_has_no_hits(self): + r = self._make_result( + original_images=["paper.pdf"], + page_metadata=[], + page_number_candidates=[], + document_page_numbering=None, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert isinstance(saved, list) + def test_repr(self): r = self._make_result() assert "PipelineResult" in repr(r) @@ -1251,6 +1530,93 @@ def test_parse_stream_selfhosted_delegates(self): preserve_order=True, ) + def test_maas_response_includes_printed_page_metadata_when_enabled(self): + """MaaS conversion derives printed page data from number blocks.""" + from glmocr.api import GlmOcr + from glmocr.config import GlmOcrConfig, ResultFormatterConfig + + parser = object.__new__(GlmOcr) + parser._use_maas = True + parser._pipeline = None + parser._maas_client = MagicMock() + parser.config_model = GlmOcrConfig() + parser.config_model.pipeline.result_formatter = ResultFormatterConfig( + detect_printed_page_numbers=True + ) + + response = { + "md_results": "", + "layout_details": [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [1926, 32, 1982, 111], + "score": 0.88, + } + ] + ], + "data_info": {"pages": [{"width": 2040, "height": 2640}]}, + } + + result = parser._maas_response_to_pipeline_result(response, "paper.pdf") + + assert result.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ] + assert result.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + } + assert result.page_metadata[0]["printed_page_label"] == "12" + + def 
test_maas_response_feature_off_keeps_json_result_lean(self): + """MaaS conversion does not leak broad layout metadata when feature is off.""" + from glmocr.api import GlmOcr + from glmocr.config import GlmOcrConfig + + parser = object.__new__(GlmOcr) + parser._use_maas = True + parser._pipeline = None + parser._maas_client = MagicMock() + parser.config_model = GlmOcrConfig() + + response = { + "md_results": "", + "layout_details": [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [1926, 32, 1982, 111], + "score": 0.88, + } + ] + ], + "data_info": {"pages": [{"width": 2040, "height": 2640}]}, + } + + result = parser._maas_response_to_pipeline_result(response, "paper.pdf") + + block = result.json_result[0][0] + assert block["native_label"] == "number" + assert "layout_index" not in block + assert "layout_score" not in block + class TestGlmOcrConstructor: """Tests for GlmOcr.__init__ kwarg handling (config assembly only).""" @@ -1308,6 +1674,25 @@ def test_selfhosted_model_kwarg_is_forwarded_to_ocr_api(self, monkeypatch): assert parser.config_model.pipeline.ocr_api.model == "glm-ocr" parser.close() + def test_detect_printed_page_numbers_kwarg_is_forwarded(self, monkeypatch): + """Public constructor flag enables printed page detection in config.""" + from glmocr.config import _ENV_MAP, ENV_PREFIX + + for suffix in _ENV_MAP: + monkeypatch.delenv(f"{ENV_PREFIX}{suffix}", raising=False) + monkeypatch.setattr("glmocr.config._find_dotenv", lambda: None) + + with patch("glmocr.maas_client.MaaSClient") as mock_maas: + mock_maas.return_value.start = MagicMock() + from glmocr.api import GlmOcr + + parser = GlmOcr(api_key="sk-test", detect_printed_page_numbers=True) + assert ( + parser.config_model.pipeline.result_formatter.detect_printed_page_numbers + is True + ) + parser.close() + class TestOCRClientOllamaConfig: """Tests for OCRClient initialization with Ollama api_mode."""