zai-org · VooDisss · Mar 30, 2026 · Mar 30, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/README.md b/README.md
@@ -222,6 +222,16 @@ with GlmOcr() as parser:
     print(result.json_result)
     result.save()
 
+# Extract printed page numbers from PP-DocLayoutV3 `number` regions
+with GlmOcr(detect_printed_page_numbers=True) as parser:
+    result = parser.parse("document.pdf")
+    print(result.to_dict().get("page_metadata", []))
+
+# Export image assets with rendered and embedded outputs
+with GlmOcr(enable_image_asset_export=True) as parser:
+    result = parser.parse("document.pdf")
+    result.save()
+
 # Place layout model on CPU (useful when GPU is reserved for OCR)
 with GlmOcr(layout_device="cpu") as parser:
     result = parser.parse("image.png")
@@ -302,6 +312,12 @@ pipeline:
   # Result formatting
   result_formatter:
     output_format: both # json, markdown, or both
+    detect_printed_page_numbers: false
+    enable_image_asset_export: false
+    markdown_image_preference: embedded # embedded | rendered
+    image_match_iou_threshold: 0.5
+    image_match_containment_threshold: 0.8
+    rendered_image_dpi: 300
 
   # Layout model device placement
   layout:
@@ -310,6 +326,40 @@ pipeline:
 
 See [config.yaml](glmocr/config.yaml) for all options.
 
+Printed page number detection can be enabled in three ways:
+
+```python
+with GlmOcr(detect_printed_page_numbers=True) as parser:
+    result = parser.parse("document.pdf")
+```
+
+```powershell
+$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true'
+```
+
+```yaml
+pipeline:
+  result_formatter:
+    detect_printed_page_numbers: true
+```
+
+Image asset export can also be enabled from Python or YAML:
+
+```python
+with GlmOcr(enable_image_asset_export=True) as parser:
+    result = parser.parse("document.pdf")
+```
+
+```yaml
+pipeline:
+  result_formatter:
+    enable_image_asset_export: true
+    markdown_image_preference: embedded
+    image_match_iou_threshold: 0.5
+    image_match_containment_threshold: 0.8
+    rendered_image_dpi: 300
+```
+
 ### Output Formats
 
 Here are two examples of output formats:
@@ -320,6 +370,61 @@ Here are two examples of output formats:
 [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]]
 ```
 
+When printed page number detection is enabled and printed page numbers are actually found,
+the saved `paper.json` is wrapped in a top-level object that includes:
+
+```json
+{
+  "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]],
+  "page_number_candidates": [
+    {
+      "page_index": 1,
+      "label": "number",
+      "content": "22",
+      "layout_index": 0,
+      "bbox_2d": [92, 26, 120, 41],
+      "layout_score": 0.77,
+      "numeric_like": true,
+      "roman_like": false
+    }
+  ],
+  "document_page_numbering": {
+    "strategy": "visual_sequence",
+    "confidence": 1.0,
+    "sequence_type": "arabic",
+    "page_offset": 21,
+    "candidate_pages": 4
+  },
+  "page_metadata": [
+    {
+      "page_index": 1,
+      "printed_page_label": "22",
+      "printed_page_block_index": 0,
+      "printed_page_bbox_2d": [92, 26, 120, 41],
+      "printed_page_confidence": 0.77
+    }
+  ]
+}
+```
+
+When image asset export is enabled, image-like blocks can additionally expose the following fields:
+
+```json
+{
+  "label": "image",
+  "image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg",
+  "rendered_image_path": "imgs_rendered/cropped_page2_idx0.jpg",
+  "embedded_image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg",
+  "image_asset_source": "embedded"
+}
+```
+
+Behavior summary:
+- rendered image assets are written to `imgs_rendered/`
+- if `enable_image_asset_export=true`, matched embedded PDF images are also written to `imgs_embedded/`
+- `image_path` follows `markdown_image_preference`
+- `embedded_image_path` is `null` when no embedded match exists
+
 - Markdown
 
 ```markdown

diff --git a/README_zh.md b/README_zh.md
@@ -214,6 +214,16 @@ with GlmOcr() as parser:
     result = parser.parse("image.png")
     print(result.json_result)
     result.save()
+
+# 从 PP-DocLayoutV3 的 `number` 区域提取印刷页码
+with GlmOcr(detect_printed_page_numbers=True) as parser:
+    result = parser.parse("document.pdf")
+    print(result.to_dict().get("page_metadata", []))
+
+# 导出渲染图像与嵌入式 PDF 图像资产
+with GlmOcr(enable_image_asset_export=True) as parser:
+    result = parser.parse("document.pdf")
+    result.save()
 ```
 
 #### Flask 服务
@@ -287,10 +297,50 @@ pipeline:
   # Result formatting
   result_formatter:
     output_format: both # json, markdown, or both
+    detect_printed_page_numbers: false
+    enable_image_asset_export: false
+    markdown_image_preference: embedded # embedded | rendered
+    image_match_iou_threshold: 0.5
+    image_match_containment_threshold: 0.8
+    rendered_image_dpi: 300
 ```
 
 更多选项请参考 [config.yaml](glmocr/config.yaml)。
 
+印刷页码检测支持以下三种启用方式：
+
+```python
+with GlmOcr(detect_printed_page_numbers=True) as parser:
+    result = parser.parse("document.pdf")
+```
+
+```powershell
+$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true'
+```
+
+```yaml
+pipeline:
+  result_formatter:
+    detect_printed_page_numbers: true
+```
+
+图像资产导出也可以通过 Python 或 YAML 启用：
+
+```python
+with GlmOcr(enable_image_asset_export=True) as parser:
+    result = parser.parse("document.pdf")
+```
+
+```yaml
+pipeline:
+  result_formatter:
+    enable_image_asset_export: true
+    markdown_image_preference: embedded
+    image_match_iou_threshold: 0.5
+    image_match_containment_threshold: 0.8
+    rendered_image_dpi: 300
+```
+
 ### 输出格式
 
 这里给出两种输出格式示例：
@@ -301,6 +351,60 @@ pipeline:
 [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]]
 ```
 
+启用印刷页码检测且实际检测到印刷页码数据时，保存的 `paper.json` 会变成顶层对象，并包含：
+
+```json
+{
+  "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]],
+  "page_number_candidates": [
+    {
+      "page_index": 1,
+      "label": "number",
+      "content": "22",
+      "layout_index": 0,
+      "bbox_2d": [92, 26, 120, 41],
+      "layout_score": 0.77,
+      "numeric_like": true,
+      "roman_like": false
+    }
+  ],
+  "document_page_numbering": {
+    "strategy": "visual_sequence",
+    "confidence": 1.0,
+    "sequence_type": "arabic",
+    "page_offset": 21,
+    "candidate_pages": 4
+  },
+  "page_metadata": [
+    {
+      "page_index": 1,
+      "printed_page_label": "22",
+      "printed_page_block_index": 0,
+      "printed_page_bbox_2d": [92, 26, 120, 41],
+      "printed_page_confidence": 0.77
+    }
+  ]
+}
+```
+
+启用图像资产导出后，图像类区域还会额外暴露：
+
+```json
+{
+  "label": "image",
+  "image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg",
+  "rendered_image_path": "imgs_rendered/cropped_page2_idx0.jpg",
+  "embedded_image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg",
+  "image_asset_source": "embedded"
+}
+```
+
+行为说明：
+- 渲染图像资产写入 `imgs_rendered/`
+- 当 `enable_image_asset_export=true` 时，匹配成功的嵌入式 PDF 图像还会写入 `imgs_embedded/`
+- `image_path` 会根据 `markdown_image_preference` 选择最终引用的资产
+- 若没有嵌入式匹配，`embedded_image_path` 为 `null`
+
 - Markdown
 
 ```markdown

diff --git a/glmocr/api.py b/glmocr/api.py
@@ -29,8 +29,8 @@
 
 from glmocr.config import load_config
 from glmocr.parser_result import PipelineResult
+from glmocr.utils.image_asset_utils import export_image_assets
 from glmocr.utils.logging import get_logger, ensure_logging_configured
-from glmocr.utils.markdown_utils import resolve_image_regions
 
 logger = get_logger(__name__)
 
@@ -84,6 +84,7 @@ def __init__(
         ocr_api_port: Optional[int] = None,
         cuda_visible_devices: Optional[str] = None,
         layout_device: Optional[str] = None,
+        detect_printed_page_numbers: Optional[bool] = None,
         **kwargs: Any,
     ):
         """Initialize GlmOcr.
@@ -130,6 +131,7 @@ def __init__(
             ocr_api_port=ocr_api_port,
             cuda_visible_devices=cuda_visible_devices,
             layout_device=layout_device,
+            detect_printed_page_numbers=detect_printed_page_numbers,
             **kwargs,
         )
         # Apply logging config for API/SDK usage.
@@ -441,8 +443,11 @@ def _maas_response_to_pipeline_result(
                     {
                         "index": region.get("index", 0),
                         "label": region.get("label", "text"),
+                        "native_label": region.get("label", "text"),
                         "content": region.get("content", ""),
                         "bbox_2d": bbox,
+                        "layout_index": region.get("index", 0),
+                        "layout_score": float(region.get("score") or 0.0),
                     }
                 )
             json_result.append(page_result)
@@ -454,18 +459,43 @@ def _maas_response_to_pipeline_result(
             pages_info,
         )
 
-        json_result, markdown_result, image_files = resolve_image_regions(
+        json_result, markdown_result, image_files = export_image_assets(
             json_result,
             markdown_result,
             source,
+            enable_image_asset_export=self.config_model.pipeline.result_formatter.enable_image_asset_export,
+            markdown_image_preference=self.config_model.pipeline.result_formatter.markdown_image_preference,
+            image_match_iou_threshold=self.config_model.pipeline.result_formatter.image_match_iou_threshold,
+            image_match_containment_threshold=self.config_model.pipeline.result_formatter.image_match_containment_threshold,
+            rendered_image_dpi=self.config_model.pipeline.result_formatter.rendered_image_dpi,
         )
 
+        page_metadata = None
+        page_number_candidates = None
+        document_page_numbering = None
+        if self.config_model.pipeline.result_formatter.detect_printed_page_numbers:
+            from glmocr.postprocess import ResultFormatter
+
+            formatter = ResultFormatter(self.config_model.pipeline.result_formatter)
+            (
+                page_number_candidates,
+                document_page_numbering,
+                page_metadata,
+            ) = formatter.extract_printed_page_data(json_result)
+
+        from glmocr.postprocess import ResultFormatter
+
+        ResultFormatter._strip_layout_metadata(json_result)
+
         # Create PipelineResult
         result = PipelineResult(
             json_result=json_result,
             markdown_result=markdown_result,
             original_images=[source],
             image_files=image_files or None,
+            page_metadata=page_metadata,
+            page_number_candidates=page_number_candidates,
+            document_page_numbering=document_page_numbering,
         )
 
         # Store additional MaaS response data