From 75a5d181df7172a168bd2fd0a6b50ffaedac16c3 Mon Sep 17 00:00:00 2001
From: Jakub Kowalski <kuba@pathway.com>
Date: Thu, 13 Feb 2025 13:42:11 +0100
Subject: [PATCH] Cleanup dependencies (#8212)

GitOrigin-RevId: 68dadb185be7b52368f19432dfde44ebde9ac3d8
---
 pyproject.toml                       |  6 +++---
 python/pathway/xpacks/llm/parsers.py | 19 +++++++++++++++----
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index fd5753ec..49d17501 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,8 +55,8 @@ xpack-llm = [
     "litellm ~= 1.44.28",
     "cohere ~= 5.1.0",
     "tiktoken >= 0.5",
-    "langchain == 0.2.0",
-    "langchain_community == 0.2.0",
+    "langchain ~= 0.2.0",
+    "langchain_community ~= 0.2.0",
     "llama-index-core ~= 0.10.0",
     "llama-index-readers-pathway ~= 0.1.0",
     "llama-index-retrievers-pathway ~= 0.1.3",
@@ -64,7 +64,6 @@ xpack-llm = [
     "instructor == 1.2.6",
     "google-generativeai ~= 0.8.4",
     "google-api-core ~= 2.24.1",
-    "docling >= 2.15, <3.0",
 ]
 xpack-llm-local = [  # requirements that allow local ML inference
     "unstructured[all-docs] >= 0.16, < 0.16.15",
@@ -72,6 +71,7 @@ xpack-llm-local = [  # requirements that allow local ML inference
     "transformers >= 4.42.0",
 ]
 xpack-llm-docs = [
+    "docling >= 2.15, <3.0",
     "python-docx >= 1.1.2",
     "unstructured >= 0.16, < 0.16.12",
     "pdf2image",
diff --git a/python/pathway/xpacks/llm/parsers.py b/python/pathway/xpacks/llm/parsers.py
index 026136d9..dc87f9e5 100644
--- a/python/pathway/xpacks/llm/parsers.py
+++ b/python/pathway/xpacks/llm/parsers.py
@@ -354,7 +354,12 @@ def __init__(
     ):
         with optional_imports("xpack-llm-docs"):
             from docling.datamodel.pipeline_options import PdfPipelineOptions
-            from docling.document_converter import DocumentConverter, PdfFormatOption
+            from docling.document_converter import (
+                DocumentConverter,
+                InputFormat,
+                PdfFormatOption,
+            )
+            from docling_core.types.doc import ImageRefMode
 
         self.multimodal_llm: llms.OpenAIChat | llms.LiteLLMChat | None
         self.parse_images = parse_images
@@ -371,11 +376,15 @@ def __init__(
                     retry_strategy=udfs.ExponentialBackoffRetryStrategy(max_retries=4),
                     verbose=True,
                 )
-            self.image_mode = "embedded"  # will make docling export document to markdown with base64-embedded images
+            self.image_mode = (
+                ImageRefMode.EMBEDDED
+            )  # will make docling export document to markdown with base64-embedded images
             self.multimodal_llm = multimodal_llm
         else:
             self.multimodal_llm = None
-            self.image_mode = "placeholder"  # will make docling export document to markdown with image placeholders
+            self.image_mode = (
+                ImageRefMode.PLACEHOLDER
+            )  # will make docling export document to markdown with image placeholders
 
         default_pipeline_options = {
             "do_table_structure": True,
@@ -392,7 +401,9 @@ def __init__(
 
         # actual docling converter
         self.converter: DocumentConverter = DocumentConverter(
-            format_options={"pdf": PdfFormatOption(pipeline_options=pipeline_options)},
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+            },
             # TODO: Add more file types
         )
         super().__init__(cache_strategy=cache_strategy)