22import logging
33import ftfy
44import sys
5+ import os
6+ import tempfile
7+ from typing import BinaryIO
58
69from langchain_community .document_loaders import (
710 AzureAIDocumentIntelligenceLoader ,
8588
8689
8790class TikaLoader :
88- def __init__ (self , url , file_path , mime_type = None ):
91+ def __init__ (self , url , file_stream , mime_type = None ):
8992 self .url = url
90- self .file_path = file_path
93+ self .file_stream = file_stream
9194 self .mime_type = mime_type
9295
9396 def load (self ) -> list [Document ]:
94- with open (self .file_path , "rb" ) as f :
95- data = f .read ()
9697
9798 if self .mime_type is not None :
9899 headers = {"Content-Type" : self .mime_type }
@@ -104,7 +105,7 @@ def load(self) -> list[Document]:
104105 endpoint += "/"
105106 endpoint += "tika/text"
106107
107- r = requests .put (endpoint , data = data , headers = headers )
108+ r = requests .put (endpoint , data = self . file_stream , headers = headers )
108109
109110 if r .ok :
110111 raw_metadata = r .json ()
@@ -119,30 +120,30 @@ def load(self) -> list[Document]:
119120 else :
120121 raise Exception (f"Error calling Tika: { r .reason } " )
121122
122-
123123class DoclingLoader :
124- def __init__ (self , url , file_path = None , mime_type = None ):
124+ def __init__ (self , url , file_name , file_stream = None , mime_type = None ):
125125 self .url = url .rstrip ("/" )
126- self .file_path = file_path
126+ self .file_stream = file_stream
127127 self .mime_type = mime_type
128+ self .file_name = file_name
128129
129130 def load (self ) -> list [Document ]:
130- with open ( self . file_path , "rb" ) as f :
131- files = {
132- "files" : (
133- self .file_path ,
134- f ,
135- self .mime_type or "application/octet-stream" ,
136- )
137- }
138-
139- params = {
140- "image_export_mode" : "placeholder" ,
141- "table_mode" : "accurate" ,
142- }
143-
144- endpoint = f"{ self .url } /v1alpha/convert/file"
145- r = requests .post (endpoint , files = files , data = params )
131+
132+ files = {
133+ "files" : (
134+ self .file_name ,
135+ self . file_stream ,
136+ self .mime_type or "application/octet-stream" ,
137+ )
138+ }
139+
140+ params = {
141+ "image_export_mode" : "placeholder" ,
142+ "table_mode" : "accurate" ,
143+ }
144+
145+ endpoint = f"{ self .url } /v1alpha/convert/file"
146+ r = requests .post (endpoint , files = files , data = params )
146147
147148 if r .ok :
148149 result = r .json ()
@@ -172,10 +173,48 @@ def __init__(self, engine: str = "", **kwargs):
172173 self .kwargs = kwargs
173174
174175 def load (
175- self , filename : str , file_content_type : str , file_path : str
176+ self , filename : str , file_content_type : str , file_stream : BinaryIO
176177 ) -> list [Document ]:
177- loader = self ._get_loader (filename , file_content_type , file_path )
178- docs = loader .load ()
178+
179+ if self .engine == "tika" and self .kwargs .get ("TIKA_SERVER_URL" ):
180+ log .debug (f"Using Tika loader with server URL: { self .kwargs .get ('TIKA_SERVER_URL' )} " )
181+ loader = TikaLoader (
182+ url = self .kwargs .get ("TIKA_SERVER_URL" ),
183+ file_stream = file_stream ,
184+ mime_type = file_content_type ,
185+ )
186+ docs = loader .load ()
187+
188+ elif self .engine == "docling" and self .kwargs .get ("DOCLING_API_URL" ):
189+ log .debug (f"Using Docling loader with server URL: { self .kwargs .get ('DOCLING_API_URL' )} " )
190+ loader = DoclingLoader (
191+ url = self .kwargs .get ("DOCLING_SERVER_URL" ),
192+ file_name = filename ,
193+ file_stream = file_stream ,
194+ mime_type = file_content_type ,
195+ )
196+ docs = loader .load ()
197+
198+ elif (
199+ self .engine == "mistral_ocr"
200+ and self .kwargs .get ("MISTRAL_OCR_API_KEY" ) != ""
201+ and file_ext
202+ in ["pdf" ] # Mistral OCR currently only supports PDF and images
203+ ):
204+ loader = MistralLoader (
205+ api_key = self .kwargs .get ("MISTRAL_OCR_API_KEY" ), file_name = filename , file_stream = file_stream
206+ )
207+
208+ else :
209+ log .info (f"Using local file loader for: { filename } with content type: { file_content_type } " )
210+ file_ext = filename .split ("." )[- 1 ].lower ()
211+ with tempfile .NamedTemporaryFile (delete = True , suffix = f".{ file_ext } " ) as tmp_file :
212+ tmp_file .write (file_stream .read ())
213+ tmp_file .flush ()
214+ tmp_path = tmp_file .name
215+ loader = self ._get_loader (filename , file_content_type , tmp_path )
216+ docs = loader .load ()
217+
179218
180219 return [
181220 Document (
@@ -191,26 +230,8 @@ def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
191230
192231 def _get_loader (self , filename : str , file_content_type : str , file_path : str ):
193232 file_ext = filename .split ("." )[- 1 ].lower ()
194-
195- if self .engine == "tika" and self .kwargs .get ("TIKA_SERVER_URL" ):
196- if self ._is_text_file (file_ext , file_content_type ):
197- loader = TextLoader (file_path , autodetect_encoding = True )
198- else :
199- loader = TikaLoader (
200- url = self .kwargs .get ("TIKA_SERVER_URL" ),
201- file_path = file_path ,
202- mime_type = file_content_type ,
203- )
204- elif self .engine == "docling" and self .kwargs .get ("DOCLING_SERVER_URL" ):
205- if self ._is_text_file (file_ext , file_content_type ):
206- loader = TextLoader (file_path , autodetect_encoding = True )
207- else :
208- loader = DoclingLoader (
209- url = self .kwargs .get ("DOCLING_SERVER_URL" ),
210- file_path = file_path ,
211- mime_type = file_content_type ,
212- )
213- elif (
233+
234+ if (
214235 self .engine == "document_intelligence"
215236 and self .kwargs .get ("DOCUMENT_INTELLIGENCE_ENDPOINT" ) != ""
216237 and self .kwargs .get ("DOCUMENT_INTELLIGENCE_KEY" ) != ""
@@ -231,53 +252,43 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
231252 api_endpoint = self .kwargs .get ("DOCUMENT_INTELLIGENCE_ENDPOINT" ),
232253 api_key = self .kwargs .get ("DOCUMENT_INTELLIGENCE_KEY" ),
233254 )
255+ elif file_ext == "pdf" :
256+ loader = PyPDFLoader (
257+ file_path , extract_images = self .kwargs .get ("PDF_EXTRACT_IMAGES" )
258+ )
259+ elif file_ext == "csv" :
260+ loader = CSVLoader (file_path , autodetect_encoding = True )
261+ elif file_ext == "rst" :
262+ loader = UnstructuredRSTLoader (file_path , mode = "elements" )
263+ elif file_ext == "xml" :
264+ loader = UnstructuredXMLLoader (file_path )
265+ elif file_ext in ["htm" , "html" ]:
266+ loader = BSHTMLLoader (file_path , open_encoding = "unicode_escape" )
267+ elif file_ext == "md" :
268+ loader = TextLoader (file_path , autodetect_encoding = True )
269+ elif file_content_type == "application/epub+zip" :
270+ loader = UnstructuredEPubLoader (file_path )
234271 elif (
235- self .engine == "mistral_ocr"
236- and self .kwargs .get ("MISTRAL_OCR_API_KEY" ) != ""
237- and file_ext
238- in ["pdf" ] # Mistral OCR currently only supports PDF and images
272+ file_content_type
273+ == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
274+ or file_ext == "docx"
239275 ):
240- loader = MistralLoader (
241- api_key = self .kwargs .get ("MISTRAL_OCR_API_KEY" ), file_path = file_path
242- )
276+ loader = Docx2txtLoader (file_path )
277+ elif file_content_type in [
278+ "application/vnd.ms-excel" ,
279+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ,
280+ ] or file_ext in ["xls" , "xlsx" ]:
281+ loader = UnstructuredExcelLoader (file_path )
282+ elif file_content_type in [
283+ "application/vnd.ms-powerpoint" ,
284+ "application/vnd.openxmlformats-officedocument.presentationml.presentation" ,
285+ ] or file_ext in ["ppt" , "pptx" ]:
286+ loader = UnstructuredPowerPointLoader (file_path )
287+ elif file_ext == "msg" :
288+ loader = OutlookMessageLoader (file_path )
289+ elif self ._is_text_file (file_ext , file_content_type ):
290+ loader = TextLoader (file_path , autodetect_encoding = True )
243291 else :
244- if file_ext == "pdf" :
245- loader = PyPDFLoader (
246- file_path , extract_images = self .kwargs .get ("PDF_EXTRACT_IMAGES" )
247- )
248- elif file_ext == "csv" :
249- loader = CSVLoader (file_path , autodetect_encoding = True )
250- elif file_ext == "rst" :
251- loader = UnstructuredRSTLoader (file_path , mode = "elements" )
252- elif file_ext == "xml" :
253- loader = UnstructuredXMLLoader (file_path )
254- elif file_ext in ["htm" , "html" ]:
255- loader = BSHTMLLoader (file_path , open_encoding = "unicode_escape" )
256- elif file_ext == "md" :
257- loader = TextLoader (file_path , autodetect_encoding = True )
258- elif file_content_type == "application/epub+zip" :
259- loader = UnstructuredEPubLoader (file_path )
260- elif (
261- file_content_type
262- == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
263- or file_ext == "docx"
264- ):
265- loader = Docx2txtLoader (file_path )
266- elif file_content_type in [
267- "application/vnd.ms-excel" ,
268- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ,
269- ] or file_ext in ["xls" , "xlsx" ]:
270- loader = UnstructuredExcelLoader (file_path )
271- elif file_content_type in [
272- "application/vnd.ms-powerpoint" ,
273- "application/vnd.openxmlformats-officedocument.presentationml.presentation" ,
274- ] or file_ext in ["ppt" , "pptx" ]:
275- loader = UnstructuredPowerPointLoader (file_path )
276- elif file_ext == "msg" :
277- loader = OutlookMessageLoader (file_path )
278- elif self ._is_text_file (file_ext , file_content_type ):
279- loader = TextLoader (file_path , autodetect_encoding = True )
280- else :
281- loader = TextLoader (file_path , autodetect_encoding = True )
292+ loader = TextLoader (file_path , autodetect_encoding = True )
282293
283294 return loader
0 commit comments