|
11 | 11 | import tiktoken |
12 | 12 | from html2text import __version__ as html2text_version |
13 | 13 | from html2text import html2text |
| 14 | +from unstructured.documents.elements import Image, Table |
| 15 | +from unstructured.partition.auto import partition |
14 | 16 |
|
15 | 17 | from paperqa.types import ( |
16 | 18 | ChunkMetadata, |
@@ -171,6 +173,53 @@ def parse_text( |
171 | 173 | ) |
172 | 174 |
|
173 | 175 |
|
def parse_office_doc(
    path: str | os.PathLike,
    page_size_limit: int | None = None,
    **kwargs,
) -> ParsedText:
    """Parse an office document (.docx, .xlsx, .pptx) using unstructured.

    Extracts plain text (tables as HTML when available, to preserve their
    structure through chunking) and embedded images. The whole document is
    treated as a single page keyed "1".

    Args:
        path: Path to the office document.
        page_size_limit: Optional cap on the total extracted text length;
            exceeding it raises ValueError. None disables the check.
        **kwargs: Forwarded to unstructured.partition.auto.partition.

    Returns:
        ParsedText with content mapping "1" to (text, media) and metadata
        recording the parsing library, version, and sizes.

    Raises:
        ValueError: If the extracted text exceeds page_size_limit.
    """
    elements = partition(str(path), **kwargs)

    media: list[ParsedMedia] = []
    text_parts: list[str] = []

    for el in elements:
        if isinstance(el, Image):
            # NOTE(review): assumes unstructured exposes the embedded image
            # payload as metadata.image_data — confirm against the installed
            # unstructured version (some versions use image_base64 instead).
            if el.metadata.image_data:
                media.append(
                    ParsedMedia(
                        index=len(media),
                        data=el.metadata.image_data,
                        # Stored under "suffix" for parity with other parsers,
                        # though the value is a MIME type (e.g. "image/png").
                        info={"suffix": el.metadata.image_mime_type},
                    )
                )
        elif isinstance(el, Table) and el.metadata.text_as_html:
            # Prefer the HTML rendering so table structure survives; a Table
            # without HTML falls through to the plain-text branch below
            # (the original code would crash concatenating None here).
            text_parts.append(el.metadata.text_as_html)
        else:
            text_parts.append(str(el))

    # Join once instead of += in a loop; keep the trailing separator the
    # original per-element concatenation produced.
    full_text = "\n\n".join(text_parts) + ("\n\n" if text_parts else "")

    # Enforce the limit the signature promises (it was previously ignored).
    if page_size_limit is not None and len(full_text) > page_size_limit:
        raise ValueError(
            f"Document {path} text length {len(full_text)} exceeds"
            f" the page size limit of {page_size_limit}."
        )

    return ParsedText(
        content={"1": (full_text, media)},
        metadata=ParsedMetadata(
            parsing_libraries=["unstructured"],
            paperqa_version=pqa_version,
            total_parsed_text_length=len(full_text),
            count_parsed_media=len(media),
            name=f"office_doc|path={path}",
        ),
    )
| 221 | + |
| 222 | + |
174 | 223 | def chunk_text( |
175 | 224 | parsed_text: ParsedText, |
176 | 225 | doc: Doc, |
@@ -276,7 +325,7 @@ def chunk_code_text( |
276 | 325 |
|
277 | 326 | IMAGE_EXTENSIONS = tuple({".png", ".jpg", ".jpeg"}) |
278 | 327 | # When HTML reader supports images, add here |
279 | | -ENRICHMENT_EXTENSIONS = tuple({".pdf", *IMAGE_EXTENSIONS}) |
| 328 | +ENRICHMENT_EXTENSIONS = tuple({".pdf", ".docx", ".xlsx", ".pptx", *IMAGE_EXTENSIONS}) |
280 | 329 |
|
281 | 330 |
|
282 | 331 | @overload |
@@ -383,6 +432,9 @@ async def read_doc( # noqa: PLR0912 |
383 | 432 | ) |
384 | 433 | elif str_path.endswith(IMAGE_EXTENSIONS): |
385 | 434 | parsed_text = await parse_image(path, **parser_kwargs) |
| 435 | + elif str_path.endswith((".docx", ".xlsx", ".pptx")): |
| 436 | + # TODO: Make parse_office_doc async |
| 437 | + parsed_text = await asyncio.to_thread(parse_office_doc, path, **parser_kwargs) |
386 | 438 | else: |
387 | 439 | parsed_text = await asyncio.to_thread( |
388 | 440 | parse_text, path, split_lines=True, **parser_kwargs |
|
0 commit comments