Skip to content

Commit 18c58db

Browse files
feat: Add multimodal support for Office documents
1 parent e837ebe commit 18c58db

File tree

2 files changed

+56
-1
lines changed

2 files changed

+56
-1
lines changed

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ zotero = [
110110
"paper-qa-pymupdf",
111111
"pyzotero",
112112
]
113+
office = [
114+
"unstructured[docx,xlsx,pptx]",
115+
]
113116

114117
[project.scripts]
115118
pqa = "paperqa.agents:main"

src/paperqa/readers.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import tiktoken
1212
from html2text import __version__ as html2text_version
1313
from html2text import html2text
14+
from unstructured.documents.elements import Image, Table
15+
from unstructured.partition.auto import partition
1416

1517
from paperqa.types import (
1618
ChunkMetadata,
@@ -171,6 +173,53 @@ def parse_text(
171173
)
172174

173175

176+
def parse_office_doc(
    path: str | os.PathLike,
    page_size_limit: int | None = None,
    **kwargs,
) -> ParsedText:
    """Parse an Office document (.docx, .xlsx, .pptx) into text and media.

    The file is partitioned with unstructured. Image elements that carry an
    embedded payload become ``ParsedMedia`` entries, table elements contribute
    their HTML rendering (falling back to plain text when none is available),
    and every other element contributes its string form. All content is
    stored under the single page key ``"1"``.

    Args:
        path: Location of the document to parse.
        page_size_limit: Accepted as part of the signature, but not currently
            enforced by this function.
        **kwargs: Forwarded unchanged to unstructured's ``partition``.

    Returns:
        A ``ParsedText`` whose content maps ``"1"`` to a ``(text, media)``
        tuple, with metadata recording the text length and media count.
    """
    import base64  # noqa: PLC0415  # only needed to decode embedded images

    elements = partition(str(path), **kwargs)

    text_parts: list[str] = []
    media_list: list[ParsedMedia] = []

    for el in elements:
        if isinstance(el, Image):
            # unstructured exposes embedded image payloads as base64 text in
            # `image_base64`; elements without a payload are skipped.
            image_b64 = getattr(el.metadata, "image_base64", None)
            if not image_b64:
                continue
            media_list.append(
                ParsedMedia(
                    # Media are numbered in encounter order, starting at 0
                    index=len(media_list),
                    data=base64.b64decode(image_b64),
                    # NOTE(review): this stores a MIME type (e.g. "image/png")
                    # under "suffix" -- confirm what downstream consumers of
                    # ParsedMedia.info expect here.
                    info={"suffix": el.metadata.image_mime_type},
                )
            )
        elif isinstance(el, Table):
            # Prefer the HTML rendering to retain table structure, but
            # text_as_html can be unset, so fall back to the plain text.
            text_parts.append(el.metadata.text_as_html or str(el))
        else:
            text_parts.append(str(el))

    # Each element's text is terminated by a blank line
    current_text = "".join(f"{part}\n\n" for part in text_parts)

    # For office docs, the whole document is treated as a single "page"
    content_dict = {"1": (current_text, media_list)}

    return ParsedText(
        content=content_dict,
        metadata=ParsedMetadata(
            parsing_libraries=["unstructured"],
            paperqa_version=pqa_version,
            total_parsed_text_length=len(current_text),
            count_parsed_media=len(media_list),
            name=f"office_doc|path={path}",
        ),
    )
221+
222+
174223
def chunk_text(
175224
parsed_text: ParsedText,
176225
doc: Doc,
@@ -276,7 +325,7 @@ def chunk_code_text(
276325

277326
IMAGE_EXTENSIONS = tuple({".png", ".jpg", ".jpeg"})
278327
# When HTML reader supports images, add here
279-
ENRICHMENT_EXTENSIONS = tuple({".pdf", *IMAGE_EXTENSIONS})
328+
ENRICHMENT_EXTENSIONS = tuple({".pdf", ".docx", ".xlsx", ".pptx", *IMAGE_EXTENSIONS})
280329

281330

282331
@overload
@@ -383,6 +432,9 @@ async def read_doc( # noqa: PLR0912
383432
)
384433
elif str_path.endswith(IMAGE_EXTENSIONS):
385434
parsed_text = await parse_image(path, **parser_kwargs)
435+
elif str_path.endswith((".docx", ".xlsx", ".pptx")):
436+
# TODO: Make parse_office_doc async
437+
parsed_text = await asyncio.to_thread(parse_office_doc, path, **parser_kwargs)
386438
else:
387439
parsed_text = await asyncio.to_thread(
388440
parse_text, path, split_lines=True, **parser_kwargs

0 commit comments

Comments
 (0)