Skip to content

Commit

Permalink
Make python-docx an optional dependency
Browse files Browse the repository at this point in the history
- Add unit tests (testing on `paper.docx` which is a copy of paper.pdf)
- Minor improvements
  • Loading branch information
taabishm2 committed Sep 14, 2024
1 parent 6515493 commit 5814c04
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 23 deletions.
26 changes: 17 additions & 9 deletions paperqa/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path
from typing import Literal, overload

import docx
import os
import pymupdf
import tiktoken
from html2text import __version__ as html2text_version
Expand Down Expand Up @@ -34,7 +34,14 @@ def parse_pdf_to_pages(path: Path) -> ParsedText:
)
return ParsedText(content=pages, metadata=metadata)

def parse_docx_to_text(path: Path) -> ParsedText:
def parse_docx_to_text(path: str | os.PathLike) -> ParsedText:
try:
import docx
except ImportError as e:
raise ImportError(
"python-docx is required for reading docx files. Please install using:"
" `pip3 install paper-qa[python-docx]`."
) from e
doc = docx.Document(path)
text = "\n".join([para.text for para in doc.paragraphs])
metadata = ParsedMetadata(
Expand Down Expand Up @@ -280,17 +287,18 @@ def read_doc(
parsed_text_only: return parsed text without chunking
include_metadata: return a tuple
"""
str_path = str(path)
# Convert to lowercase for case-insensitive file extension matching
str_path = str(path).lower()
parsed_text = None

# start with parsing -- users may want to store this separately
if str_path.lower().endswith(".pdf"):
if str_path.endswith(".pdf"):
parsed_text = parse_pdf_to_pages(path)
elif str_path.lower().endswith((".doc", ".docx")):
elif str_path.endswith((".doc", ".docx")):
parsed_text = parse_docx_to_text(path)
elif str_path.lower().endswith(".txt"):
elif str_path.endswith(".txt"):
parsed_text = parse_text(path)
elif str_path.lower().endswith(".html"):
elif str_path.endswith(".html"):
parsed_text = parse_text(path, html=True)
else:
parsed_text = parse_text(path, split_lines=True, use_tiktoken=False)
Expand All @@ -299,14 +307,14 @@ def read_doc(
return parsed_text

# next chunk the parsed text
if str_path.lower().endswith(".pdf"):
if str_path.endswith(".pdf"):
chunked_text = chunk_pdf(
parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
)
chunk_metadata = ChunkMetadata(
chunk_chars=chunk_chars, overlap=overlap, chunk_type="overlap_pdf_by_page"
)
elif str_path.lower().endswith((".txt", ".html", ".doc", ".docx")):
elif str_path.endswith((".txt", ".html", ".doc", ".docx")):
chunked_text = chunk_text(
parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
)
Expand Down
22 changes: 11 additions & 11 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,19 @@ classifiers = [
]
dependencies = [
"PyCryptodome",
"aiohttp",
# TODO: remove in favor of httpx
"aiohttp", # TODO: remove in favor of httpx
"anyio",
"fhaviary[llm]>=0.6",
# For info on Message
"html2text",
# TODO: evaluate moving to an opt-in dependency
"fhaviary[llm]>=0.6", # For info on Message
"html2text", # TODO: evaluate moving to an opt-in dependency
"httpx",
"litellm>=1.44",
# to prevent sys.stdout on router creation
"litellm>=1.44", # to prevent sys.stdout on router creation
"numpy",
"pybtex",
"pydantic-settings",
"pydantic~=2.0",
"pymupdf",
"python-docx>=1.1.2",
"rich",
"setuptools",
# TODO: remove after release of https://bitbucket.org/pybtex-devs/pybtex/pull-requests/46/replace-pkg_resources-with-importlib
"setuptools", # TODO: remove after release of https://bitbucket.org/pybtex-devs/pybtex/pull-requests/46/replace-pkg_resources-with-importlib
"tantivy",
"tenacity",
"tiktoken>=0.4.0",
Expand Down Expand Up @@ -69,6 +63,9 @@ typing = [
zotero = [
"pyzotero",
]
docx = [
"python-docx",
]

[project.scripts]
pqa = "paperqa.agents:main"
Expand Down Expand Up @@ -146,7 +143,9 @@ module = [
"datasets", # SEE: https://github.com/huggingface/datasets/issues/3841
"litellm", # SEE: https://github.com/BerriAI/litellm/issues/825
"pybtex.*", # SEE: https://bitbucket.org/pybtex-devs/pybtex/issues/141/type-annotations
"pymupdf", # SEE: https://github.com/pymupdf/PyMuPDF/issues/3361
"pyzotero", # SEE: https://github.com/urschrei/pyzotero/issues/110
"docx", # SEE: https://github.com/python-openxml/python-docx/issues/1432
]

[tool.pylint]
Expand Down Expand Up @@ -418,6 +417,7 @@ dev-dependencies = [
"pytest>=8", # Pin to keep recent
"python-dotenv",
"pyzotero",
"python-docx",
"refurb>=2", # Pin to keep recent
"requests",
]
Binary file added tests/stub_data/paper.docx
Binary file not shown.
Binary file removed tests/stub_data/paper.pdf
Binary file not shown.
41 changes: 41 additions & 0 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,3 +1018,44 @@ def test_external_doc_index(stub_data_dir: Path) -> None:
docs2 = Docs(texts_index=docs.texts_index)
assert not docs2.docs
assert docs2.get_evidence("What is the date of flag day?").contexts


def test_docx_reader_w_no_match_doc_details(stub_data_dir: Path) -> None:
    """Check that a .docx added with an explicit citation keeps that citation."""
    expected_citation = "Wellawatte et al, XAI Review, 2023"
    docx_path = stub_data_dir / "paper.docx"

    docs = Docs()
    docs.add(docx_path, expected_citation)

    # doc will be a DocDetails object, but nothing can be found
    # thus, we retain the prior citation data
    first_doc = next(iter(docs.docs.values()))
    assert first_doc.citation == expected_citation


def test_docx_reader_match_doc_details(stub_data_dir: Path) -> None:
    """Check the details recorded for a .docx added with `use_doc_details=True`.

    Asserts on the stored dockey, docname, authors and DOI, then runs a
    yes/no query against the added document.
    """
    citation = (
        "Wellawatte et al, A Perspective on Explanations of Molecular Prediction"
        " Models, XAI Review, 2023"
    )
    acceptable_dockeys = {
        "41f786fcc56d27ff0c1507153fae3774",  # From file contents
        "5300ef1d5fb960d7",  # Or from crossref data
    }
    expected_authors = {
        "Geemi P. Wellawatte",
        "Heta A. Gandhi",
        "Aditi Seshadri",
        "Andrew D. White",
    }

    docs = Docs()
    # we limit to only crossref since s2 is too flaky
    docs.add(
        stub_data_dir / "paper.docx",
        citation,
        use_doc_details=True,
        clients={CrossrefProvider},
        fields=["author", "journal"],
    )

    details = next(iter(docs.docs.values()))
    assert details.dockey in acceptable_dockeys
    # note year is unknown because citation string is only parsed for authors/title/doi
    # AND we do not request it back from the metadata sources
    assert details.docname == "wellawatteUnknownyearaperspectiveon"
    assert set(details.authors) == expected_authors  # type: ignore[attr-defined]
    assert details.doi == "10.26434/chemrxiv-2022-qfv02"  # type: ignore[attr-defined]

    response = docs.query("Are counterfactuals actionable? [yes/no]")
    assert any(token in response.answer for token in ("yes", "Yes"))
10 changes: 7 additions & 3 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 5814c04

Please sign in to comment.