diff --git a/paperqa/readers.py b/paperqa/readers.py index faca366f..432fc568 100644 --- a/paperqa/readers.py +++ b/paperqa/readers.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Literal, overload -import docx +import os import pymupdf import tiktoken from html2text import __version__ as html2text_version @@ -34,7 +34,14 @@ def parse_pdf_to_pages(path: Path) -> ParsedText: ) return ParsedText(content=pages, metadata=metadata) -def parse_docx_to_text(path: Path) -> ParsedText: +def parse_docx_to_text(path: str | os.PathLike) -> ParsedText: + try: + import docx + except ImportError as e: + raise ImportError( + "python-docx is required for reading docx files. Please install using:" + " `pip3 install paper-qa[docx]`." + ) from e doc = docx.Document(path) text = "\n".join([para.text for para in doc.paragraphs]) metadata = ParsedMetadata( @@ -280,17 +287,18 @@ def read_doc( parsed_text_only: return parsed text without chunking include_metadata: return a tuple """ - str_path = str(path) + # Convert to lowercase for case-insensitive file extension matching + str_path = str(path).lower() parsed_text = None # start with parsing -- users may want to store this separately - if str_path.lower().endswith(".pdf"): + if str_path.endswith(".pdf"): parsed_text = parse_pdf_to_pages(path) - elif str_path.lower().endswith((".doc", ".docx")): + elif str_path.endswith((".doc", ".docx")): parsed_text = parse_docx_to_text(path) - elif str_path.lower().endswith(".txt"): + elif str_path.endswith(".txt"): parsed_text = parse_text(path) - elif str_path.lower().endswith(".html"): + elif str_path.endswith(".html"): parsed_text = parse_text(path, html=True) else: parsed_text = parse_text(path, split_lines=True, use_tiktoken=False) @@ -299,14 +307,14 @@ def read_doc( return parsed_text # next chunk the parsed text - if str_path.lower().endswith(".pdf"): + if str_path.endswith(".pdf"): chunked_text = chunk_pdf( parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap ) 
chunk_metadata = ChunkMetadata( chunk_chars=chunk_chars, overlap=overlap, chunk_type="overlap_pdf_by_page" ) - elif str_path.lower().endswith((".txt", ".html", ".doc", ".docx")): + elif str_path.endswith((".txt", ".html", ".doc", ".docx")): chunked_text = chunk_text( parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap ) diff --git a/pyproject.toml b/pyproject.toml index 368982b1..093738ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,25 +18,19 @@ classifiers = [ ] dependencies = [ "PyCryptodome", - "aiohttp", - # TODO: remove in favor of httpx + "aiohttp", # TODO: remove in favor of httpx "anyio", - "fhaviary[llm]>=0.6", - # For info on Message - "html2text", - # TODO: evaluate moving to an opt-in dependency + "fhaviary[llm]>=0.6", # For info on Message + "html2text", # TODO: evaluate moving to an opt-in dependency "httpx", - "litellm>=1.44", - # to prevent sys.stdout on router creation + "litellm>=1.44", # to prevent sys.stdout on router creation "numpy", "pybtex", "pydantic-settings", "pydantic~=2.0", "pymupdf", - "python-docx>=1.1.2", "rich", - "setuptools", - # TODO: remove after release of https://bitbucket.org/pybtex-devs/pybtex/pull-requests/46/replace-pkg_resources-with-importlib + "setuptools", # TODO: remove after release of https://bitbucket.org/pybtex-devs/pybtex/pull-requests/46/replace-pkg_resources-with-importlib "tantivy", "tenacity", "tiktoken>=0.4.0", @@ -69,6 +63,9 @@ typing = [ zotero = [ "pyzotero", ] +docx = [ + "python-docx", +] [project.scripts] pqa = "paperqa.agents:main" @@ -146,7 +143,9 @@ module = [ "datasets", # SEE: https://github.com/huggingface/datasets/issues/3841 "litellm", # SEE: https://github.com/BerriAI/litellm/issues/825 "pybtex.*", # SEE: https://bitbucket.org/pybtex-devs/pybtex/issues/141/type-annotations + "pymupdf", # SEE: https://github.com/pymupdf/PyMuPDF/issues/3361 "pyzotero", # SEE: https://github.com/urschrei/pyzotero/issues/110 + "docx", # SEE: 
https://github.com/python-openxml/python-docx/issues/1432 ] [tool.pylint] @@ -418,6 +417,7 @@ dev-dependencies = [ "pytest>=8", # Pin to keep recent "python-dotenv", "pyzotero", + "python-docx", "refurb>=2", # Pin to keep recent "requests", ] diff --git a/tests/stub_data/paper.docx b/tests/stub_data/paper.docx new file mode 100644 index 00000000..c8d65e1b Binary files /dev/null and b/tests/stub_data/paper.docx differ diff --git a/tests/stub_data/paper.pdf b/tests/stub_data/paper.pdf deleted file mode 100644 index e211006d..00000000 Binary files a/tests/stub_data/paper.pdf and /dev/null differ diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index c44adedd..16ada0c6 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1018,3 +1018,44 @@ def test_external_doc_index(stub_data_dir: Path) -> None: docs2 = Docs(texts_index=docs.texts_index) assert not docs2.docs assert docs2.get_evidence("What is the date of flag day?").contexts + + +def test_docx_reader_w_no_match_doc_details(stub_data_dir: Path) -> None: + docs = Docs() + docs.add(stub_data_dir / "paper.docx", "Wellawatte et al, XAI Review, 2023") + # doc will be a DocDetails object, but nothing can be found + # thus, we retain the prior citation data + assert ( + next(iter(docs.docs.values())).citation == "Wellawatte et al, XAI Review, 2023" + ) + + +def test_docx_reader_match_doc_details(stub_data_dir: Path) -> None: + doc_path = stub_data_dir / "paper.docx" + docs = Docs() + # we limit to only crossref since s2 is too flaky + docs.add( + doc_path, + "Wellawatte et al, A Perspective on Explanations of Molecular Prediction" + " Models, XAI Review, 2023", + use_doc_details=True, + clients={CrossrefProvider}, + fields=["author", "journal"], + ) + doc_details = next(iter(docs.docs.values())) + assert doc_details.dockey in { + "41f786fcc56d27ff0c1507153fae3774", # From file contents + "5300ef1d5fb960d7", # Or from crossref data + } + # note year is unknown because citation string is only parsed for 
authors/title/doi + # AND we do not request it back from the metadata sources + assert doc_details.docname == "wellawatteUnknownyearaperspectiveon" + assert set(doc_details.authors) == { # type: ignore[attr-defined] + "Geemi P. Wellawatte", + "Heta A. Gandhi", + "Aditi Seshadri", + "Andrew D. White", + } + assert doc_details.doi == "10.26434/chemrxiv-2022-qfv02" # type: ignore[attr-defined] + answer = docs.query("Are counterfactuals actionable? [yes/no]") + assert "yes" in answer.answer or "Yes" in answer.answer diff --git a/uv.lock b/uv.lock index ee011660..bc4afd3a 100644 --- a/uv.lock +++ b/uv.lock @@ -1276,7 +1276,7 @@ wheels = [ [[package]] name = "paper-qa" -version = "5.0.3.dev3+gcd82b52.d20240913" +version = "5.0.3.dev4+g6515493.d20240914" source = { editable = "." } dependencies = [ { name = "aiohttp" }, @@ -1291,7 +1291,6 @@ dependencies = [ { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pymupdf" }, - { name = "python-docx" }, { name = "rich" }, { name = "setuptools" }, { name = "tantivy" }, @@ -1303,6 +1302,9 @@ dependencies = [ datasets = [ { name = "datasets" }, ] +docx = [ + { name = "python-docx" }, +] ldp = [ { name = "ldp" }, ] @@ -1334,6 +1336,7 @@ dev = [ { name = "pytest-sugar" }, { name = "pytest-timer", extra = ["colorama"] }, { name = "pytest-xdist" }, + { name = "python-docx" }, { name = "python-dotenv" }, { name = "pyzotero" }, { name = "refurb" }, @@ -1359,7 +1362,7 @@ requires-dist = [ { name = "pydantic", specifier = "~=2.0" }, { name = "pydantic-settings" }, { name = "pymupdf" }, - { name = "python-docx", specifier = ">=1.1.2" }, + { name = "python-docx", marker = "extra == 'docx'" }, { name = "pyzotero", marker = "extra == 'zotero'" }, { name = "rich" }, { name = "setuptools" }, @@ -1387,6 +1390,7 @@ dev = [ { name = "pytest-sugar" }, { name = "pytest-timer", extras = ["colorama"] }, { name = "pytest-xdist" }, + { name = "python-docx" }, { name = "python-dotenv" }, { name = "pyzotero" }, { name = "refurb", specifier = 
">=2" },