From 75a5d181df7172a168bd2fd0a6b50ffaedac16c3 Mon Sep 17 00:00:00 2001 From: Jakub Kowalski Date: Thu, 13 Feb 2025 13:42:11 +0100 Subject: [PATCH] Cleanup dependencies (#8212) GitOrigin-RevId: 68dadb185be7b52368f19432dfde44ebde9ac3d8 --- pyproject.toml | 6 +++--- python/pathway/xpacks/llm/parsers.py | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fd5753ec..49d17501 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,8 +55,8 @@ xpack-llm = [ "litellm ~= 1.44.28", "cohere ~= 5.1.0", "tiktoken >= 0.5", - "langchain == 0.2.0", - "langchain_community == 0.2.0", + "langchain ~= 0.2.0", + "langchain_community ~= 0.2.0", "llama-index-core ~= 0.10.0", "llama-index-readers-pathway ~= 0.1.0", "llama-index-retrievers-pathway ~= 0.1.3", @@ -64,7 +64,6 @@ xpack-llm = [ "instructor == 1.2.6", "google-generativeai ~= 0.8.4", "google-api-core ~= 2.24.1", - "docling >= 2.15, <3.0", ] xpack-llm-local = [ # requirements that allow local ML inference "unstructured[all-docs] >= 0.16, < 0.16.15", @@ -72,6 +71,7 @@ xpack-llm-local = [ # requirements that allow local ML inference "transformers >= 4.42.0", ] xpack-llm-docs = [ + "docling >= 2.15, <3.0", "python-docx >= 1.1.2", "unstructured >= 0.16, < 0.16.12", "pdf2image", diff --git a/python/pathway/xpacks/llm/parsers.py b/python/pathway/xpacks/llm/parsers.py index 026136d9..dc87f9e5 100644 --- a/python/pathway/xpacks/llm/parsers.py +++ b/python/pathway/xpacks/llm/parsers.py @@ -354,7 +354,12 @@ def __init__( ): with optional_imports("xpack-llm-docs"): from docling.datamodel.pipeline_options import PdfPipelineOptions - from docling.document_converter import DocumentConverter, PdfFormatOption + from docling.document_converter import ( + DocumentConverter, + InputFormat, + PdfFormatOption, + ) + from docling_core.types.doc import ImageRefMode self.multimodal_llm: llms.OpenAIChat | llms.LiteLLMChat | None self.parse_images = parse_images @@ -371,11 +376,15 @@ def __init__( retry_strategy=udfs.ExponentialBackoffRetryStrategy(max_retries=4), verbose=True, ) - self.image_mode = "embedded" # will make docling export document to markdown with base64-embedded images + self.image_mode = ( + ImageRefMode.EMBEDDED + ) # will make docling export document to markdown with base64-embedded images self.multimodal_llm = multimodal_llm else: self.multimodal_llm = None - self.image_mode = "placeholder" # will make docling export document to markdown with image placeholders + self.image_mode = ( + ImageRefMode.PLACEHOLDER + ) # will make docling export document to markdown with image placeholders default_pipeline_options = { "do_table_structure": True, @@ -392,7 +401,9 @@ def __init__( # actual docling converter self.converter: DocumentConverter = DocumentConverter( - format_options={"pdf": PdfFormatOption(pipeline_options=pipeline_options)}, + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + }, # TODO: Add more file types ) super().__init__(cache_strategy=cache_strategy)