diff --git a/glmocr/pipeline/pipeline.py b/glmocr/pipeline/pipeline.py index eeeb7b5..04e17d9 100644 --- a/glmocr/pipeline/pipeline.py +++ b/glmocr/pipeline/pipeline.py @@ -247,7 +247,16 @@ def process( user_msg["content"].append( {"type": "image_url", "image_url": {"url": data_url}} ) - per_request = self.page_loader.build_request(per_request) + # Set default parameters without calling build_request(), + # which would re-process the already-encoded image through + # load_image_to_base64 a second time. + per_request.setdefault("max_tokens", self.page_loader.max_tokens) + per_request.setdefault("temperature", self.page_loader.temperature) + per_request.setdefault("top_p", self.page_loader.top_p) + per_request.setdefault("top_k", self.page_loader.top_k) + per_request.setdefault( + "repetition_penalty", self.page_loader.repetition_penalty + ) response, status_code = self.ocr_client.process(per_request) if status_code != 200: raise Exception( diff --git a/glmocr/utils/__init__.py b/glmocr/utils/__init__.py index 684efa9..7b217c4 100644 --- a/glmocr/utils/__init__.py +++ b/glmocr/utils/__init__.py @@ -12,17 +12,23 @@ configure_logging, set_log_level, ) -from .visualization_utils import ( - draw_layout_boxes, - save_layout_visualization, - get_colormap, -) from .result_postprocess_utils import ( find_consecutive_repeat, clean_repeated_content, clean_formula_number, ) + +def __getattr__(name): + # Lazy imports for layout-only symbols that require opencv-python. + _viz_names = {"draw_layout_boxes", "save_layout_visualization", "get_colormap"} + if name in _viz_names: + from . import visualization_utils + + return getattr(visualization_utils, name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + __all__ = [ "smart_resize", "load_image_to_base64", diff --git a/glmocr/utils/image_utils.py b/glmocr/utils/image_utils.py index e228cbb..0ebc58a 100644 --- a/glmocr/utils/image_utils.py +++ b/glmocr/utils/image_utils.py @@ -1,7 +1,6 @@ """Image processing utilities.""" import io -import cv2 import base64 import math from io import BytesIO @@ -191,6 +190,7 @@ def crop_image_region(image, bbox_2d, polygon=None, fill_color=255): Returns: PIL.Image.Image: Cropped region with optional polygon mask applied """ + import cv2 image_width, image_height = image.size # De-normalize bbox to pixel coordinates diff --git a/pyproject.toml b/pyproject.toml index 97f0fef..e4a2499 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ authors = [ {name = "ZHIPUAI", email = "info@zhipuai.cn"} ] readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" license = {text = "Apache-2.0"} keywords = ["ocr", "glm", "ai", "vision"] classifiers = [ @@ -18,10 +18,10 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ] dependencies = [ @@ -34,19 +34,8 @@ dependencies = [ "portalocker>=2.8.2", "python-dotenv>=0.21.0", - # Layout detection - "torch>=2.0.0", - "torchvision>=0.15.0", - "transformers>=5.1.0", - "sentencepiece>=0.1.99", - "accelerate>=0.20.0", - "opencv-python>=4.8.0", - # PDF support "pypdfium2>=5.3.0", - - # Flask server - "flask>=2.3.0", ] [project.optional-dependencies] @@ -68,14 +57,7 @@ server = [ ] all = [ - "torch>=2.0.0", - "torchvision>=0.15.0", - "transformers>=4.30.0", - "sentencepiece>=0.1.99", - "accelerate>=0.20.0", - "opencv-python>=4.8.0", - "pdf2image>=1.16.0", - "flask>=2.3.0", + "glmocr[layout,server]", ] dev = [ "pytest>=7.0.0", @@ -100,7 +82,7 @@ include = ["glmocr*"] [tool.black] line-length = 100 -target-version = ["py38", "py39", "py310", "py311"] +target-version = ["py310", "py311", "py312", "py313"] [tool.pytest.ini_options] testpaths = ["glmocr/tests"]