Make python-docx an optional dependency

- Add unit tests (run against `paper.docx`, which is a copy of `paper.pdf`)
- Minor improvements
Future-House · Sep 14, 2024 · 5814c04 · 5814c04
1 parent 6515493
commit 5814c04
Show file tree

Hide file tree

Showing 6 changed files with 76 additions and 23 deletions.
diff --git a/paperqa/readers.py b/paperqa/readers.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import Literal, overload
 
-import docx
+import os
 import pymupdf
 import tiktoken
 from html2text import __version__ as html2text_version
@@ -34,7 +34,14 @@ def parse_pdf_to_pages(path: Path) -> ParsedText:
     )
     return ParsedText(content=pages, metadata=metadata)
 
-def parse_docx_to_text(path: Path) -> ParsedText:
+def parse_docx_to_text(path: str | os.PathLike) -> ParsedText:
+    try:
+        import docx
+    except ImportError as e:
+        raise ImportError(
+            "python-docx is required for reading docx files. Please install using:"
+            " `pip3 install paper-qa[docx]`."
+        ) from e
     doc = docx.Document(path)
     text = "\n".join([para.text for para in doc.paragraphs])
     metadata = ParsedMetadata(
@@ -280,17 +287,18 @@ def read_doc(
         parsed_text_only: return parsed text without chunking
         include_metadata: return a tuple
     """
-    str_path = str(path)
+    # Convert to lowercase for case-insensitive file extension matching
+    str_path = str(path).lower()
     parsed_text = None
 
     # start with parsing -- users may want to store this separately
-    if str_path.lower().endswith(".pdf"):
+    if str_path.endswith(".pdf"):
         parsed_text = parse_pdf_to_pages(path)
-    elif str_path.lower().endswith((".doc", ".docx")):
+    elif str_path.endswith((".doc", ".docx")):
         parsed_text = parse_docx_to_text(path)
-    elif str_path.lower().endswith(".txt"):
+    elif str_path.endswith(".txt"):
         parsed_text = parse_text(path)
-    elif str_path.lower().endswith(".html"):
+    elif str_path.endswith(".html"):
         parsed_text = parse_text(path, html=True)
     else:
         parsed_text = parse_text(path, split_lines=True, use_tiktoken=False)
@@ -299,14 +307,14 @@ def read_doc(
         return parsed_text
 
     # next chunk the parsed text
-    if str_path.lower().endswith(".pdf"):
+    if str_path.endswith(".pdf"):
         chunked_text = chunk_pdf(
             parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
         )
         chunk_metadata = ChunkMetadata(
             chunk_chars=chunk_chars, overlap=overlap, chunk_type="overlap_pdf_by_page"
         )
-    elif str_path.lower().endswith((".txt", ".html", ".doc", ".docx")):
+    elif str_path.endswith((".txt", ".html", ".doc", ".docx")):
         chunked_text = chunk_text(
             parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
         )

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,25 +18,19 @@ classifiers = [
 ]
 dependencies = [
     "PyCryptodome",
-    "aiohttp",
-    # TODO: remove in favor of httpx
+    "aiohttp",  # TODO: remove in favor of httpx
     "anyio",
-    "fhaviary[llm]>=0.6",
-    # For info on Message
-    "html2text",
-    # TODO: evaluate moving to an opt-in dependency
+    "fhaviary[llm]>=0.6",  # For info on Message
+    "html2text",  # TODO: evaluate moving to an opt-in dependency
     "httpx",
-    "litellm>=1.44",
-    # to prevent sys.stdout on router creation
+    "litellm>=1.44",  # to prevent sys.stdout on router creation
     "numpy",
     "pybtex",
     "pydantic-settings",
     "pydantic~=2.0",
     "pymupdf",
-    "python-docx>=1.1.2",
     "rich",
-    "setuptools",
-    # TODO: remove after release of https://bitbucket.org/pybtex-devs/pybtex/pull-requests/46/replace-pkg_resources-with-importlib
+    "setuptools",  # TODO: remove after release of https://bitbucket.org/pybtex-devs/pybtex/pull-requests/46/replace-pkg_resources-with-importlib
     "tantivy",
     "tenacity",
     "tiktoken>=0.4.0",
@@ -69,6 +63,9 @@ typing = [
 zotero = [
     "pyzotero",
 ]
+docx = [
+    "python-docx",
+]
 
 [project.scripts]
 pqa = "paperqa.agents:main"
@@ -146,7 +143,9 @@ module = [
     "datasets",  # SEE: https://github.com/huggingface/datasets/issues/3841
     "litellm",  # SEE: https://github.com/BerriAI/litellm/issues/825
     "pybtex.*",  # SEE: https://bitbucket.org/pybtex-devs/pybtex/issues/141/type-annotations
+    "pymupdf",  # SEE: https://github.com/pymupdf/PyMuPDF/issues/3361
     "pyzotero",  # SEE: https://github.com/urschrei/pyzotero/issues/110
+    "docx",  # SEE: https://github.com/python-openxml/python-docx/issues/1432
 ]
 
 [tool.pylint]
@@ -418,6 +417,7 @@ dev-dependencies = [
     "pytest>=8",  # Pin to keep recent
     "python-dotenv",
     "pyzotero",
+    "python-docx",
     "refurb>=2",  # Pin to keep recent
     "requests",
 ]
diff --git a/tests/stub_data/paper.docx b/tests/stub_data/paper.docx
diff --git a/tests/stub_data/paper.pdf b/tests/stub_data/paper.pdf
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -1018,3 +1018,44 @@ def test_external_doc_index(stub_data_dir: Path) -> None:
     docs2 = Docs(texts_index=docs.texts_index)
     assert not docs2.docs
     assert docs2.get_evidence("What is the date of flag day?").contexts
+
+
+def test_docx_reader_w_no_match_doc_details(stub_data_dir: Path) -> None:
+    docs = Docs()
+    docs.add(stub_data_dir / "paper.docx", "Wellawatte et al, XAI Review, 2023")
+    # doc will be a DocDetails object, but nothing can be found
+    # thus, we retain the prior citation data
+    assert (
+        next(iter(docs.docs.values())).citation == "Wellawatte et al, XAI Review, 2023"
+    )
+
+
+def test_docx_reader_match_doc_details(stub_data_dir: Path) -> None:
+    doc_path = stub_data_dir / "paper.docx"
+    docs = Docs()
+    # we limit to only crossref since s2 is too flaky
+    docs.add(
+        doc_path,
+        "Wellawatte et al, A Perspective on Explanations of Molecular Prediction"
+        " Models, XAI Review, 2023",
+        use_doc_details=True,
+        clients={CrossrefProvider},
+        fields=["author", "journal"],
+    )
+    doc_details = next(iter(docs.docs.values()))
+    assert doc_details.dockey in {
+        "41f786fcc56d27ff0c1507153fae3774",  # From file contents
+        "5300ef1d5fb960d7",  # Or from crossref data
+    }
+    # note year is unknown because citation string is only parsed for authors/title/doi
+    # AND we do not request it back from the metadata sources
+    assert doc_details.docname == "wellawatteUnknownyearaperspectiveon"
+    assert set(doc_details.authors) == {  # type: ignore[attr-defined]
+        "Geemi P. Wellawatte",
+        "Heta A. Gandhi",
+        "Aditi Seshadri",
+        "Andrew D. White",
+    }
+    assert doc_details.doi == "10.26434/chemrxiv-2022-qfv02"  # type: ignore[attr-defined]
+    answer = docs.query("Are counterfactuals actionable? [yes/no]")
+    assert "yes" in answer.answer or "Yes" in answer.answer
diff --git a/uv.lock b/uv.lock