Skip to content

Commit aa2ebed

Browse files
committed
File stream handling
Adapt File Storage Layer to handle files as Stream and don't store them on local FS Adapted: - Files Router - Audio Router - Retrieval Router - main Retrieval - Mistral Document Loader In Function Model added update Function for better updates Refs: - PRODAI-115
1 parent 4e32818 commit aa2ebed

File tree

6 files changed

+395
-314
lines changed

6 files changed

+395
-314
lines changed

backend/open_webui/retrieval/loaders/main.py

Lines changed: 103 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
import logging
33
import ftfy
44
import sys
5+
import os
6+
import tempfile
7+
from typing import BinaryIO
58

69
from langchain_community.document_loaders import (
710
AzureAIDocumentIntelligenceLoader,
@@ -85,14 +88,12 @@
8588

8689

8790
class TikaLoader:
88-
def __init__(self, url, file_path, mime_type=None):
91+
def __init__(self, url, file_stream, mime_type=None):
8992
self.url = url
90-
self.file_path = file_path
93+
self.file_stream = file_stream
9194
self.mime_type = mime_type
9295

9396
def load(self) -> list[Document]:
94-
with open(self.file_path, "rb") as f:
95-
data = f.read()
9697

9798
if self.mime_type is not None:
9899
headers = {"Content-Type": self.mime_type}
@@ -104,7 +105,7 @@ def load(self) -> list[Document]:
104105
endpoint += "/"
105106
endpoint += "tika/text"
106107

107-
r = requests.put(endpoint, data=data, headers=headers)
108+
r = requests.put(endpoint, data=self.file_stream, headers=headers)
108109

109110
if r.ok:
110111
raw_metadata = r.json()
@@ -119,30 +120,30 @@ def load(self) -> list[Document]:
119120
else:
120121
raise Exception(f"Error calling Tika: {r.reason}")
121122

122-
123123
class DoclingLoader:
124-
def __init__(self, url, file_path=None, mime_type=None):
124+
def __init__(self, url, file_name, file_stream=None, mime_type=None):
125125
self.url = url.rstrip("/")
126-
self.file_path = file_path
126+
self.file_stream = file_stream
127127
self.mime_type = mime_type
128+
self.file_name = file_name
128129

129130
def load(self) -> list[Document]:
130-
with open(self.file_path, "rb") as f:
131-
files = {
132-
"files": (
133-
self.file_path,
134-
f,
135-
self.mime_type or "application/octet-stream",
136-
)
137-
}
138-
139-
params = {
140-
"image_export_mode": "placeholder",
141-
"table_mode": "accurate",
142-
}
143-
144-
endpoint = f"{self.url}/v1alpha/convert/file"
145-
r = requests.post(endpoint, files=files, data=params)
131+
132+
files = {
133+
"files": (
134+
self.file_name,
135+
self.file_stream,
136+
self.mime_type or "application/octet-stream",
137+
)
138+
}
139+
140+
params = {
141+
"image_export_mode": "placeholder",
142+
"table_mode": "accurate",
143+
}
144+
145+
endpoint = f"{self.url}/v1alpha/convert/file"
146+
r = requests.post(endpoint, files=files, data=params)
146147

147148
if r.ok:
148149
result = r.json()
@@ -172,10 +173,48 @@ def __init__(self, engine: str = "", **kwargs):
172173
self.kwargs = kwargs
173174

174175
def load(
175-
self, filename: str, file_content_type: str, file_path: str
176+
self, filename: str, file_content_type: str, file_stream: BinaryIO
176177
) -> list[Document]:
177-
loader = self._get_loader(filename, file_content_type, file_path)
178-
docs = loader.load()
178+
179+
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
180+
log.debug(f"Using Tika loader with server URL: {self.kwargs.get('TIKA_SERVER_URL')}")
181+
loader = TikaLoader(
182+
url=self.kwargs.get("TIKA_SERVER_URL"),
183+
file_stream=file_stream,
184+
mime_type=file_content_type,
185+
)
186+
docs = loader.load()
187+
188+
elif self.engine == "docling" and self.kwargs.get("DOCLING_API_URL"):
189+
log.debug(f"Using Docling loader with server URL: {self.kwargs.get('DOCLING_API_URL')}")
190+
loader = DoclingLoader(
191+
url=self.kwargs.get("DOCLING_SERVER_URL"),
192+
file_name=filename,
193+
file_stream=file_stream,
194+
mime_type=file_content_type,
195+
)
196+
docs = loader.load()
197+
198+
elif (
199+
self.engine == "mistral_ocr"
200+
and self.kwargs.get("MISTRAL_OCR_API_KEY") != ""
201+
and file_ext
202+
in ["pdf"] # Mistral OCR currently only supports PDF and images
203+
):
204+
loader = MistralLoader(
205+
api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"), file_name=filename, file_stream=file_stream
206+
)
207+
208+
else:
209+
log.info(f"Using local file loader for: {filename} with content type: {file_content_type}")
210+
file_ext = filename.split(".")[-1].lower()
211+
with tempfile.NamedTemporaryFile(delete=True, suffix=f".{file_ext}") as tmp_file:
212+
tmp_file.write(file_stream.read())
213+
tmp_file.flush()
214+
tmp_path = tmp_file.name
215+
loader = self._get_loader(filename, file_content_type, tmp_path)
216+
docs = loader.load()
217+
179218

180219
return [
181220
Document(
@@ -191,26 +230,8 @@ def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
191230

192231
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
193232
file_ext = filename.split(".")[-1].lower()
194-
195-
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
196-
if self._is_text_file(file_ext, file_content_type):
197-
loader = TextLoader(file_path, autodetect_encoding=True)
198-
else:
199-
loader = TikaLoader(
200-
url=self.kwargs.get("TIKA_SERVER_URL"),
201-
file_path=file_path,
202-
mime_type=file_content_type,
203-
)
204-
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
205-
if self._is_text_file(file_ext, file_content_type):
206-
loader = TextLoader(file_path, autodetect_encoding=True)
207-
else:
208-
loader = DoclingLoader(
209-
url=self.kwargs.get("DOCLING_SERVER_URL"),
210-
file_path=file_path,
211-
mime_type=file_content_type,
212-
)
213-
elif (
233+
234+
if (
214235
self.engine == "document_intelligence"
215236
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
216237
and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != ""
@@ -231,53 +252,43 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
231252
api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
232253
api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
233254
)
255+
elif file_ext == "pdf":
256+
loader = PyPDFLoader(
257+
file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")
258+
)
259+
elif file_ext == "csv":
260+
loader = CSVLoader(file_path, autodetect_encoding=True)
261+
elif file_ext == "rst":
262+
loader = UnstructuredRSTLoader(file_path, mode="elements")
263+
elif file_ext == "xml":
264+
loader = UnstructuredXMLLoader(file_path)
265+
elif file_ext in ["htm", "html"]:
266+
loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
267+
elif file_ext == "md":
268+
loader = TextLoader(file_path, autodetect_encoding=True)
269+
elif file_content_type == "application/epub+zip":
270+
loader = UnstructuredEPubLoader(file_path)
234271
elif (
235-
self.engine == "mistral_ocr"
236-
and self.kwargs.get("MISTRAL_OCR_API_KEY") != ""
237-
and file_ext
238-
in ["pdf"] # Mistral OCR currently only supports PDF and images
272+
file_content_type
273+
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
274+
or file_ext == "docx"
239275
):
240-
loader = MistralLoader(
241-
api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"), file_path=file_path
242-
)
276+
loader = Docx2txtLoader(file_path)
277+
elif file_content_type in [
278+
"application/vnd.ms-excel",
279+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
280+
] or file_ext in ["xls", "xlsx"]:
281+
loader = UnstructuredExcelLoader(file_path)
282+
elif file_content_type in [
283+
"application/vnd.ms-powerpoint",
284+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
285+
] or file_ext in ["ppt", "pptx"]:
286+
loader = UnstructuredPowerPointLoader(file_path)
287+
elif file_ext == "msg":
288+
loader = OutlookMessageLoader(file_path)
289+
elif self._is_text_file(file_ext, file_content_type):
290+
loader = TextLoader(file_path, autodetect_encoding=True)
243291
else:
244-
if file_ext == "pdf":
245-
loader = PyPDFLoader(
246-
file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")
247-
)
248-
elif file_ext == "csv":
249-
loader = CSVLoader(file_path, autodetect_encoding=True)
250-
elif file_ext == "rst":
251-
loader = UnstructuredRSTLoader(file_path, mode="elements")
252-
elif file_ext == "xml":
253-
loader = UnstructuredXMLLoader(file_path)
254-
elif file_ext in ["htm", "html"]:
255-
loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
256-
elif file_ext == "md":
257-
loader = TextLoader(file_path, autodetect_encoding=True)
258-
elif file_content_type == "application/epub+zip":
259-
loader = UnstructuredEPubLoader(file_path)
260-
elif (
261-
file_content_type
262-
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
263-
or file_ext == "docx"
264-
):
265-
loader = Docx2txtLoader(file_path)
266-
elif file_content_type in [
267-
"application/vnd.ms-excel",
268-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
269-
] or file_ext in ["xls", "xlsx"]:
270-
loader = UnstructuredExcelLoader(file_path)
271-
elif file_content_type in [
272-
"application/vnd.ms-powerpoint",
273-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
274-
] or file_ext in ["ppt", "pptx"]:
275-
loader = UnstructuredPowerPointLoader(file_path)
276-
elif file_ext == "msg":
277-
loader = OutlookMessageLoader(file_path)
278-
elif self._is_text_file(file_ext, file_content_type):
279-
loader = TextLoader(file_path, autodetect_encoding=True)
280-
else:
281-
loader = TextLoader(file_path, autodetect_encoding=True)
292+
loader = TextLoader(file_path, autodetect_encoding=True)
282293

283294
return loader

backend/open_webui/retrieval/loaders/mistral.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
import os
44
import sys
5-
from typing import List, Dict, Any
5+
from typing import List, Dict, Any, BinaryIO
66

77
from langchain_core.documents import Document
88
from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL
@@ -19,21 +19,20 @@ class MistralLoader:
1919

2020
BASE_API_URL = "https://api.mistral.ai/v1"
2121

22-
def __init__(self, api_key: str, file_path: str):
22+
def __init__(self, api_key: str,file_name: str, file_stream: BinaryIO):
2323
"""
2424
Initializes the loader.
2525
2626
Args:
2727
api_key: Your Mistral API key.
28-
file_path: The local path to the PDF file to process.
28+
file_stream: The file stream of the PDF file to process.
2929
"""
3030
if not api_key:
3131
raise ValueError("API key cannot be empty.")
32-
if not os.path.exists(file_path):
33-
raise FileNotFoundError(f"File not found at {file_path}")
3432

3533
self.api_key = api_key
36-
self.file_path = file_path
34+
self.file_stream = file_stream
35+
self.file_name = file_name
3736
self.headers = {"Authorization": f"Bearer {self.api_key}"}
3837

3938
def _handle_response(self, response: requests.Response) -> Dict[str, Any]:
@@ -58,18 +57,17 @@ def _upload_file(self) -> str:
5857
"""Uploads the file to Mistral for OCR processing."""
5958
log.info("Uploading file to Mistral API")
6059
url = f"{self.BASE_API_URL}/files"
61-
file_name = os.path.basename(self.file_path)
6260

6361
try:
64-
with open(self.file_path, "rb") as f:
65-
files = {"file": (file_name, f, "application/pdf")}
66-
data = {"purpose": "ocr"}
6762

68-
upload_headers = self.headers.copy() # Avoid modifying self.headers
63+
files = {"file": (self.file_name, self.file_stream, "application/pdf")}
64+
data = {"purpose": "ocr"}
6965

70-
response = requests.post(
71-
url, headers=upload_headers, files=files, data=data
72-
)
66+
upload_headers = self.headers.copy() # Avoid modifying self.headers
67+
68+
response = requests.post(
69+
url, headers=upload_headers, files=files, data=data
70+
)
7371

7472
response_data = self._handle_response(response)
7573
file_id = response_data.get("id")

backend/open_webui/routers/audio.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pathlib import Path
88
from pydub import AudioSegment
99
from pydub.silence import split_on_silence
10-
10+
from typing import BinaryIO
1111
import aiohttp
1212
import aiofiles
1313
import requests
@@ -484,6 +484,15 @@ async def speech(request: Request, user=Depends(get_verified_user)):
484484
return FileResponse(file_path)
485485

486486

487+
def transcribe_stream(request: Request, filename: str, file_stream: BinaryIO):
488+
file_ext = filename.split(".")[-1].lower()
489+
with tempfile.NamedTemporaryFile(delete=True, suffix=f".{file_ext}") as tmp_file:
490+
tmp_file.write(file_stream.read())
491+
tmp_file.flush()
492+
tmp_path = tmp_file.name
493+
transcribe(request, tmp_path)
494+
495+
487496
def transcribe(request: Request, file_path):
488497
log.info(f"transcribe: {file_path}")
489498
filename = os.path.basename(file_path)

0 commit comments

Comments
 (0)