zai-org · VooDisss · Mar 30, 2026 · Mar 30, 2026
diff --git a/README.md b/README.md
@@ -222,6 +222,11 @@ with GlmOcr() as parser:
     print(result.json_result)
     result.save()
 
+# Extract printed page numbers from PP-DocLayoutV3 `number` regions
+with GlmOcr(detect_printed_page_numbers=True) as parser:
+    result = parser.parse("document.pdf")
+    print(result.to_dict().get("page_metadata", []))
+
 # Place layout model on CPU (useful when GPU is reserved for OCR)
 with GlmOcr(layout_device="cpu") as parser:
     result = parser.parse("image.png")
@@ -302,6 +307,7 @@ pipeline:
   # Result formatting
   result_formatter:
     output_format: both # json, markdown, or both
+    detect_printed_page_numbers: false
 
   # Layout model device placement
   layout:
@@ -310,6 +316,23 @@ pipeline:
 
 See [config.yaml](glmocr/config.yaml) for all options.
 
+Printed page number detection can be enabled in three ways — via the constructor argument, an environment variable, or the config file:
+
+```python
+with GlmOcr(detect_printed_page_numbers=True) as parser:
+    result = parser.parse("document.pdf")
+```
+
+```powershell
+$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true'
+```
+
+```yaml
+pipeline:
+  result_formatter:
+    detect_printed_page_numbers: true
+```
+
 ### Output Formats
 
 Here are two examples of output formats:
@@ -320,6 +343,43 @@ Here are two examples of output formats:
 [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]]
 ```
 
+When printed page number detection is enabled and printed page-number data is actually found,
+the saved `paper.json` is no longer the bare array shown above; it becomes a top-level object that wraps the array under `json_result` and includes:
+
+```json
+{
+  "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]],
+  "page_number_candidates": [
+    {
+      "page_index": 1,
+      "label": "number",
+      "content": "22",
+      "layout_index": 0,
+      "bbox_2d": [92, 26, 120, 41],
+      "layout_score": 0.77,
+      "numeric_like": true,
+      "roman_like": false
+    }
+  ],
+  "document_page_numbering": {
+    "strategy": "visual_sequence",
+    "confidence": 1.0,
+    "sequence_type": "arabic",
+    "page_offset": 21,
+    "candidate_pages": 4
+  },
+  "page_metadata": [
+    {
+      "page_index": 1,
+      "printed_page_label": "22",
+      "printed_page_block_index": 0,
+      "printed_page_bbox_2d": [92, 26, 120, 41],
+      "printed_page_confidence": 0.77
+    }
+  ]
+}
+```
+
 - Markdown
 
 ```markdown

diff --git a/README_zh.md b/README_zh.md
@@ -214,6 +214,11 @@ with GlmOcr() as parser:
     result = parser.parse("image.png")
     print(result.json_result)
     result.save()
+
+# 从 PP-DocLayoutV3 的 `number` 区域提取印刷页码
+with GlmOcr(detect_printed_page_numbers=True) as parser:
+    result = parser.parse("document.pdf")
+    print(result.to_dict().get("page_metadata", []))
 ```
 
 #### Flask 服务
@@ -287,10 +292,28 @@ pipeline:
   # Result formatting
   result_formatter:
     output_format: both # json, markdown, or both
+    detect_printed_page_numbers: false
 ```
 
 更多选项请参考 [config.yaml](glmocr/config.yaml)。
 
+印刷页码检测支持以下三种启用方式：
+
+```python
+with GlmOcr(detect_printed_page_numbers=True) as parser:
+    result = parser.parse("document.pdf")
+```
+
+```powershell
+$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true'
+```
+
+```yaml
+pipeline:
+  result_formatter:
+    detect_printed_page_numbers: true
+```
+
 ### 输出格式
 
 这里给出两种输出格式示例：
@@ -301,6 +324,42 @@ pipeline:
 [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]]
 ```
 
+启用印刷页码检测且实际检测到印刷页码数据时，保存的 `paper.json` 会变成顶层对象，并包含：
+
+```json
+{
+  "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]],
+  "page_number_candidates": [
+    {
+      "page_index": 1,
+      "label": "number",
+      "content": "22",
+      "layout_index": 0,
+      "bbox_2d": [92, 26, 120, 41],
+      "layout_score": 0.77,
+      "numeric_like": true,
+      "roman_like": false
+    }
+  ],
+  "document_page_numbering": {
+    "strategy": "visual_sequence",
+    "confidence": 1.0,
+    "sequence_type": "arabic",
+    "page_offset": 21,
+    "candidate_pages": 4
+  },
+  "page_metadata": [
+    {
+      "page_index": 1,
+      "printed_page_label": "22",
+      "printed_page_block_index": 0,
+      "printed_page_bbox_2d": [92, 26, 120, 41],
+      "printed_page_confidence": 0.77
+    }
+  ]
+}
+```
+
 - Markdown
 
 ```markdown

diff --git a/glmocr/api.py b/glmocr/api.py
@@ -84,6 +84,7 @@ def __init__(
         ocr_api_port: Optional[int] = None,
         cuda_visible_devices: Optional[str] = None,
         layout_device: Optional[str] = None,
+        detect_printed_page_numbers: Optional[bool] = None,
         **kwargs: Any,
     ):
         """Initialize GlmOcr.
@@ -130,6 +131,7 @@ def __init__(
             ocr_api_port=ocr_api_port,
             cuda_visible_devices=cuda_visible_devices,
             layout_device=layout_device,
+            detect_printed_page_numbers=detect_printed_page_numbers,
             **kwargs,
         )
         # Apply logging config for API/SDK usage.
@@ -441,8 +443,11 @@ def _maas_response_to_pipeline_result(
                     {
                         "index": region.get("index", 0),
                         "label": region.get("label", "text"),
+                        "native_label": region.get("label", "text"),
                         "content": region.get("content", ""),
                         "bbox_2d": bbox,
+                        "layout_index": region.get("index", 0),
+                        "layout_score": float(region.get("score") or 0.0),
                     }
                 )
             json_result.append(page_result)
@@ -460,12 +465,32 @@ def _maas_response_to_pipeline_result(
             source,
         )
 
+        page_metadata = None
+        page_number_candidates = None
+        document_page_numbering = None
+        if self.config_model.pipeline.result_formatter.detect_printed_page_numbers:
+            from glmocr.postprocess import ResultFormatter
+
+            formatter = ResultFormatter(self.config_model.pipeline.result_formatter)
+            (
+                page_number_candidates,
+                document_page_numbering,
+                page_metadata,
+            ) = formatter.extract_printed_page_data(json_result)
+
+        from glmocr.postprocess import ResultFormatter
+
+        ResultFormatter._strip_layout_metadata(json_result)
+
         # Create PipelineResult
         result = PipelineResult(
             json_result=json_result,
             markdown_result=markdown_result,
             original_images=[source],
             image_files=image_files or None,
+            page_metadata=page_metadata,
+            page_number_candidates=page_number_candidates,
+            document_page_numbering=document_page_numbering,
         )
 
         # Store additional MaaS response data

diff --git a/glmocr/config.py b/glmocr/config.py
@@ -48,6 +48,8 @@ def _find_dotenv(start: Optional[Path] = None) -> Optional[Path]:
     "LAYOUT_CUDA_VISIBLE_DEVICES": "pipeline.layout.cuda_visible_devices",
     # Explicit device for layout model: "cpu", "cuda", "cuda:0", etc.
     "LAYOUT_DEVICE": "pipeline.layout.device",
+    # Result formatter
+    "DETECT_PRINTED_PAGE_NUMBERS": "pipeline.result_formatter.detect_printed_page_numbers",
     # Logging
     "LOG_LEVEL": "logging.level",
 }
@@ -175,6 +177,7 @@ class ResultFormatterConfig(_BaseConfig):
     enable_merge_formula_numbers: bool = True
     enable_merge_text_blocks: bool = True
     enable_format_bullet_points: bool = True
+    detect_printed_page_numbers: bool = False
     label_visualization_mapping: Dict[str, Any] = Field(default_factory=dict)
 
 
@@ -260,6 +263,8 @@ def _coerce_env_value(dotted_path: str, raw: str) -> Any:
     # Boolean fields
     if dotted_path == "pipeline.maas.enabled":
         return raw.strip().lower() in ("maas", "true", "1", "yes")
+    if dotted_path == "pipeline.result_formatter.detect_printed_page_numbers":
+        return raw.strip().lower() in ("true", "1", "yes", "on")
     # Integer fields
     if dotted_path.endswith((".api_port", ".request_timeout", ".connect_timeout")):
         return int(raw)
@@ -429,6 +434,7 @@ def from_env(
             "mode": "pipeline.maas.enabled",
             "timeout": "pipeline.maas.request_timeout",
             "log_level": "logging.level",
+            "detect_printed_page_numbers": "pipeline.result_formatter.detect_printed_page_numbers",
             # Self-hosted OCR API
             "ocr_api_host": "pipeline.ocr_api.api_host",
             "ocr_api_port": "pipeline.ocr_api.api_port",

diff --git a/glmocr/config.yaml b/glmocr/config.yaml
@@ -164,6 +164,7 @@ pipeline:
         - content
         - doc_title
         - figure_title
+        - number
         - paragraph_title
         - reference_content
         - text
@@ -256,6 +257,7 @@ pipeline:
         - content
         - doc_title
         - figure_title
+        - number
         - paragraph_title
         - reference_content
         - text
@@ -274,7 +276,6 @@ pipeline:
       abandon:
         - header
         - footer
-        - number
         - footnote
         - aside_text
         - reference

diff --git a/glmocr/parser_result/base.py b/glmocr/parser_result/base.py
@@ -30,6 +30,9 @@ def __init__(
         original_images: Optional[List[str]] = None,
         image_files: Optional[Dict[str, Any]] = None,
         raw_json_result: Optional[list] = None,
+        page_metadata: Optional[List[Dict[str, Any]]] = None,
+        page_number_candidates: Optional[List[Dict[str, Any]]] = None,
+        document_page_numbering: Optional[Dict[str, Any]] = None,
     ):
         """Initialize.
 
@@ -41,6 +44,9 @@ def __init__(
                 regions, to be saved under ``imgs/`` during :meth:`save`.
             raw_json_result: Raw model output before post-processing;
                 saved as ``{name}_model.json`` alongside the final result.
+            page_metadata: Derived per-page printed page metadata.
+            page_number_candidates: Raw printed page-number candidates.
+            document_page_numbering: Document-level numbering inference.
         """
         if isinstance(json_result, str):
             try:
@@ -56,6 +62,9 @@ def __init__(
         ]
         self.image_files = image_files
         self.raw_json_result = raw_json_result
+        self.page_metadata = page_metadata
+        self.page_number_candidates = page_number_candidates
+        self.document_page_numbering = document_page_numbering
 
     @abstractmethod
     def save(
@@ -88,6 +97,27 @@ def _save_json_and_markdown(self, output_dir: Union[str, Path]) -> None:
                     json_data = json.loads(json_data)
                 except json.JSONDecodeError:
                     pass
+
+            has_printed_page_data = (
+                bool(self.page_metadata)
+                or bool(self.page_number_candidates)
+                or self.document_page_numbering is not None
+            )
+
+            if has_printed_page_data:
+                json_data = {
+                    "json_result": json_data,
+                    "page_metadata": (
+                        self.page_metadata if self.page_metadata is not None else []
+                    ),
+                    "page_number_candidates": (
+                        self.page_number_candidates
+                        if self.page_number_candidates is not None
+                        else []
+                    ),
+                    "document_page_numbering": self.document_page_numbering,
+                }
+
             with open(json_file, "w", encoding="utf-8") as f:
                 if isinstance(json_data, (dict, list)):
                     json.dump(json_data, f, ensure_ascii=False, indent=2)
@@ -134,6 +164,12 @@ def to_dict(self) -> dict:
             "markdown_result": self.markdown_result or "",
             "original_images": self.original_images,
         }
+        if self.page_metadata is not None:
+            d["page_metadata"] = self.page_metadata
+        if self.page_number_candidates is not None:
+            d["page_number_candidates"] = self.page_number_candidates
+        if self.document_page_numbering is not None:
+            d["document_page_numbering"] = self.document_page_numbering
         # Include optional metadata set by MaaS mode.
         for attr in ("_usage", "_data_info", "_error"):
             val = getattr(self, attr, None)

diff --git a/glmocr/parser_result/pipeline_result.py b/glmocr/parser_result/pipeline_result.py
@@ -26,6 +26,9 @@ def __init__(
         image_files: Optional[dict] = None,
         raw_json_result: Optional[list] = None,
         layout_vis_images: Optional[Dict[int, Any]] = None,
+        page_metadata: Optional[List[Dict[str, Any]]] = None,
+        page_number_candidates: Optional[List[Dict[str, Any]]] = None,
+        document_page_numbering: Optional[Dict[str, Any]] = None,
     ):
         """Initialize.
 
@@ -38,13 +41,19 @@ def __init__(
             raw_json_result: Raw model output before post-processing (optional).
             layout_vis_images: Mapping of ``page_idx`` → PIL Image for layout
                 visualization; saved to ``layout_vis/`` during :meth:`save`.
+            page_metadata: Derived per-page printed page metadata.
+            page_number_candidates: Raw printed page-number candidates.
+            document_page_numbering: Document-level numbering inference.
         """
         super().__init__(
             json_result=json_result,
             markdown_result=markdown_result,
             original_images=original_images,
             image_files=image_files,
             raw_json_result=raw_json_result,
+            page_metadata=page_metadata,
+            page_number_candidates=page_number_candidates,
+            document_page_numbering=document_page_numbering,
         )
         self.layout_vis_images = layout_vis_images