Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,11 @@ with GlmOcr() as parser:
print(result.json_result)
result.save()

# Extract printed page numbers from PP-DocLayoutV3 `number` regions
with GlmOcr(detect_printed_page_numbers=True) as parser:
result = parser.parse("document.pdf")
print(result.to_dict().get("page_metadata", []))

# Place layout model on CPU (useful when GPU is reserved for OCR)
with GlmOcr(layout_device="cpu") as parser:
result = parser.parse("image.png")
Expand Down Expand Up @@ -302,6 +307,7 @@ pipeline:
# Result formatting
result_formatter:
output_format: both # json, markdown, or both
detect_printed_page_numbers: false

# Layout model device placement
layout:
Expand All @@ -310,6 +316,23 @@ pipeline:

See [config.yaml](glmocr/config.yaml) for all options.

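Printed page number detection can be enabled in three ways: via the `GlmOcr` constructor argument, the `GLMOCR_DETECT_PRINTED_PAGE_NUMBERS` environment variable (PowerShell syntax shown below), or the YAML config: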

```python
with GlmOcr(detect_printed_page_numbers=True) as parser:
result = parser.parse("document.pdf")
```

```powershell
$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true'
```

```yaml
pipeline:
result_formatter:
detect_printed_page_numbers: true
```

### Output Formats

Here are two examples of output formats:
Expand All @@ -320,6 +343,43 @@ Here are two examples of output formats:
[[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]]
```

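When printed page number detection is enabled and printed page data is actually found,
the saved `paper.json` is no longer a bare list: the original list moves under the `json_result` key of a top-level object, which also includes the keys shown below: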

```json
{
"json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]],
"page_number_candidates": [
{
"page_index": 1,
"label": "number",
"content": "22",
"layout_index": 0,
"bbox_2d": [92, 26, 120, 41],
"layout_score": 0.77,
"numeric_like": true,
"roman_like": false
}
],
"document_page_numbering": {
"strategy": "visual_sequence",
"confidence": 1.0,
"sequence_type": "arabic",
"page_offset": 21,
"candidate_pages": 4
},
"page_metadata": [
{
"page_index": 1,
"printed_page_label": "22",
"printed_page_block_index": 0,
"printed_page_bbox_2d": [92, 26, 120, 41],
"printed_page_confidence": 0.77
}
]
}
```

- Markdown

```markdown
Expand Down
59 changes: 59 additions & 0 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ with GlmOcr() as parser:
result = parser.parse("image.png")
print(result.json_result)
result.save()

# 从 PP-DocLayoutV3 的 `number` 区域提取印刷页码
with GlmOcr(detect_printed_page_numbers=True) as parser:
result = parser.parse("document.pdf")
print(result.to_dict().get("page_metadata", []))
```

#### Flask 服务
Expand Down Expand Up @@ -287,10 +292,28 @@ pipeline:
# Result formatting
result_formatter:
output_format: both # json, markdown, or both
detect_printed_page_numbers: false
```

更多选项请参考 [config.yaml](glmocr/config.yaml)。

印刷页码检测支持以下三种启用方式:`GlmOcr` 构造函数参数、环境变量 `GLMOCR_DETECT_PRINTED_PAGE_NUMBERS`(下方示例为 PowerShell 语法)或 YAML 配置:

```python
with GlmOcr(detect_printed_page_numbers=True) as parser:
result = parser.parse("document.pdf")
```

```powershell
$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true'
```

```yaml
pipeline:
result_formatter:
detect_printed_page_numbers: true
```

### 输出格式

这里给出两种输出格式示例:
Expand All @@ -301,6 +324,42 @@ pipeline:
[[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]]
```

启用印刷页码检测且实际检测到印刷页码数据时,保存的 `paper.json` 不再是裸列表,而是一个顶层对象:原有列表位于 `json_result` 键下,同时包含以下字段:

```json
{
"json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]],
"page_number_candidates": [
{
"page_index": 1,
"label": "number",
"content": "22",
"layout_index": 0,
"bbox_2d": [92, 26, 120, 41],
"layout_score": 0.77,
"numeric_like": true,
"roman_like": false
}
],
"document_page_numbering": {
"strategy": "visual_sequence",
"confidence": 1.0,
"sequence_type": "arabic",
"page_offset": 21,
"candidate_pages": 4
},
"page_metadata": [
{
"page_index": 1,
"printed_page_label": "22",
"printed_page_block_index": 0,
"printed_page_bbox_2d": [92, 26, 120, 41],
"printed_page_confidence": 0.77
}
]
}
```

- Markdown

```markdown
Expand Down
25 changes: 25 additions & 0 deletions glmocr/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def __init__(
ocr_api_port: Optional[int] = None,
cuda_visible_devices: Optional[str] = None,
layout_device: Optional[str] = None,
detect_printed_page_numbers: Optional[bool] = None,
**kwargs: Any,
):
"""Initialize GlmOcr.
Expand Down Expand Up @@ -130,6 +131,7 @@ def __init__(
ocr_api_port=ocr_api_port,
cuda_visible_devices=cuda_visible_devices,
layout_device=layout_device,
detect_printed_page_numbers=detect_printed_page_numbers,
**kwargs,
)
# Apply logging config for API/SDK usage.
Expand Down Expand Up @@ -441,8 +443,11 @@ def _maas_response_to_pipeline_result(
{
"index": region.get("index", 0),
"label": region.get("label", "text"),
"native_label": region.get("label", "text"),
"content": region.get("content", ""),
"bbox_2d": bbox,
"layout_index": region.get("index", 0),
"layout_score": float(region.get("score") or 0.0),
}
)
json_result.append(page_result)
Expand All @@ -460,12 +465,32 @@ def _maas_response_to_pipeline_result(
source,
)

page_metadata = None
page_number_candidates = None
document_page_numbering = None
if self.config_model.pipeline.result_formatter.detect_printed_page_numbers:
from glmocr.postprocess import ResultFormatter

formatter = ResultFormatter(self.config_model.pipeline.result_formatter)
(
page_number_candidates,
document_page_numbering,
page_metadata,
) = formatter.extract_printed_page_data(json_result)

from glmocr.postprocess import ResultFormatter

ResultFormatter._strip_layout_metadata(json_result)

# Create PipelineResult
result = PipelineResult(
json_result=json_result,
markdown_result=markdown_result,
original_images=[source],
image_files=image_files or None,
page_metadata=page_metadata,
page_number_candidates=page_number_candidates,
document_page_numbering=document_page_numbering,
)

# Store additional MaaS response data
Expand Down
6 changes: 6 additions & 0 deletions glmocr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ def _find_dotenv(start: Optional[Path] = None) -> Optional[Path]:
"LAYOUT_CUDA_VISIBLE_DEVICES": "pipeline.layout.cuda_visible_devices",
# Explicit device for layout model: "cpu", "cuda", "cuda:0", etc.
"LAYOUT_DEVICE": "pipeline.layout.device",
# Result formatter
"DETECT_PRINTED_PAGE_NUMBERS": "pipeline.result_formatter.detect_printed_page_numbers",
# Logging
"LOG_LEVEL": "logging.level",
}
Expand Down Expand Up @@ -175,6 +177,7 @@ class ResultFormatterConfig(_BaseConfig):
enable_merge_formula_numbers: bool = True
enable_merge_text_blocks: bool = True
enable_format_bullet_points: bool = True
detect_printed_page_numbers: bool = False
label_visualization_mapping: Dict[str, Any] = Field(default_factory=dict)


Expand Down Expand Up @@ -260,6 +263,8 @@ def _coerce_env_value(dotted_path: str, raw: str) -> Any:
# Boolean fields
if dotted_path == "pipeline.maas.enabled":
return raw.strip().lower() in ("maas", "true", "1", "yes")
if dotted_path == "pipeline.result_formatter.detect_printed_page_numbers":
return raw.strip().lower() in ("true", "1", "yes", "on")
# Integer fields
if dotted_path.endswith((".api_port", ".request_timeout", ".connect_timeout")):
return int(raw)
Expand Down Expand Up @@ -429,6 +434,7 @@ def from_env(
"mode": "pipeline.maas.enabled",
"timeout": "pipeline.maas.request_timeout",
"log_level": "logging.level",
"detect_printed_page_numbers": "pipeline.result_formatter.detect_printed_page_numbers",
# Self-hosted OCR API
"ocr_api_host": "pipeline.ocr_api.api_host",
"ocr_api_port": "pipeline.ocr_api.api_port",
Expand Down
3 changes: 2 additions & 1 deletion glmocr/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ pipeline:
- content
- doc_title
- figure_title
- number
- paragraph_title
- reference_content
- text
Expand Down Expand Up @@ -256,6 +257,7 @@ pipeline:
- content
- doc_title
- figure_title
- number
- paragraph_title
- reference_content
- text
Expand All @@ -274,7 +276,6 @@ pipeline:
abandon:
- header
- footer
- number
- footnote
- aside_text
- reference
Expand Down
36 changes: 36 additions & 0 deletions glmocr/parser_result/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ def __init__(
original_images: Optional[List[str]] = None,
image_files: Optional[Dict[str, Any]] = None,
raw_json_result: Optional[list] = None,
page_metadata: Optional[List[Dict[str, Any]]] = None,
page_number_candidates: Optional[List[Dict[str, Any]]] = None,
document_page_numbering: Optional[Dict[str, Any]] = None,
):
"""Initialize.

Expand All @@ -41,6 +44,9 @@ def __init__(
regions, to be saved under ``imgs/`` during :meth:`save`.
raw_json_result: Raw model output before post-processing;
saved as ``{name}_model.json`` alongside the final result.
page_metadata: Derived per-page printed page metadata.
page_number_candidates: Raw printed page-number candidates.
document_page_numbering: Document-level numbering inference.
"""
if isinstance(json_result, str):
try:
Expand All @@ -56,6 +62,9 @@ def __init__(
]
self.image_files = image_files
self.raw_json_result = raw_json_result
self.page_metadata = page_metadata
self.page_number_candidates = page_number_candidates
self.document_page_numbering = document_page_numbering

@abstractmethod
def save(
Expand Down Expand Up @@ -88,6 +97,27 @@ def _save_json_and_markdown(self, output_dir: Union[str, Path]) -> None:
json_data = json.loads(json_data)
except json.JSONDecodeError:
pass

has_printed_page_data = (
bool(self.page_metadata)
or bool(self.page_number_candidates)
or self.document_page_numbering is not None
)

if has_printed_page_data:
json_data = {
"json_result": json_data,
"page_metadata": (
self.page_metadata if self.page_metadata is not None else []
),
"page_number_candidates": (
self.page_number_candidates
if self.page_number_candidates is not None
else []
),
"document_page_numbering": self.document_page_numbering,
}

with open(json_file, "w", encoding="utf-8") as f:
if isinstance(json_data, (dict, list)):
json.dump(json_data, f, ensure_ascii=False, indent=2)
Expand Down Expand Up @@ -134,6 +164,12 @@ def to_dict(self) -> dict:
"markdown_result": self.markdown_result or "",
"original_images": self.original_images,
}
if self.page_metadata is not None:
d["page_metadata"] = self.page_metadata
if self.page_number_candidates is not None:
d["page_number_candidates"] = self.page_number_candidates
if self.document_page_numbering is not None:
d["document_page_numbering"] = self.document_page_numbering
# Include optional metadata set by MaaS mode.
for attr in ("_usage", "_data_info", "_error"):
val = getattr(self, attr, None)
Expand Down
9 changes: 9 additions & 0 deletions glmocr/parser_result/pipeline_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def __init__(
image_files: Optional[dict] = None,
raw_json_result: Optional[list] = None,
layout_vis_images: Optional[Dict[int, Any]] = None,
page_metadata: Optional[List[Dict[str, Any]]] = None,
page_number_candidates: Optional[List[Dict[str, Any]]] = None,
document_page_numbering: Optional[Dict[str, Any]] = None,
):
"""Initialize.

Expand All @@ -38,13 +41,19 @@ def __init__(
raw_json_result: Raw model output before post-processing (optional).
layout_vis_images: Mapping of ``page_idx`` → PIL Image for layout
visualization; saved to ``layout_vis/`` during :meth:`save`.
page_metadata: Derived per-page printed page metadata.
page_number_candidates: Raw printed page-number candidates.
document_page_numbering: Document-level numbering inference.
"""
super().__init__(
json_result=json_result,
markdown_result=markdown_result,
original_images=original_images,
image_files=image_files,
raw_json_result=raw_json_result,
page_metadata=page_metadata,
page_number_candidates=page_number_candidates,
document_page_numbering=document_page_numbering,
)
self.layout_vis_images = layout_vis_images

Expand Down
Loading
Loading