24 changes: 23 additions & 1 deletion README.md
@@ -747,6 +747,24 @@ Depending on the source document, the same image can appear multiple times
Thus, clients should consider media databases
to have a many-to-many relationship with chunks.

Since PaperQA's evidence-gathering process centers on text-based retrieval,
relevant images or tables may not be retrieved
when their associated text content is irrelevant to the query.
For a concrete example, imagine a figure in a paper has a terse caption
and is placed one page after the relevant main-text discussion.
To solve this problem, PaperQA supports media enrichment at document read time.
After reading in the PDF,
the `parsing.enrichment_llm` is given the `parsing.enrichment_prompt`
and co-located text to generate a synthetic caption for every image/table.
The synthetic captions are used to shift the embeddings of each text chunk,
but are kept separate from the actual source text.
This way evidence gathering can fetch relevant images/tables
without risk of polluting contextual summaries with LLM-generated captions.

If you want multimodal PDF reading but do not want enrichment
(since it adds one LLM prompt per media item at read time),
enrichment can be disabled by setting `parsing.multimodal` to `ON_WITHOUT_ENRICHMENT`.
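
As a quick illustration, here is a minimal sketch of tuning enrichment when adding a document
(mutating a default `Settings` object like this is just one convenient form,
and the file name below is hypothetical):

```python
from paperqa import Docs, Settings

settings = Settings()  # multimodal parsing with enrichment is on by default
settings.parsing.enrichment_llm = "gpt-4o-2024-11-20"  # LLM that writes the synthetic captions
settings.parsing.enrichment_page_radius = 2  # widen the page radius of co-located context text (default is 1)
# Optionally swap in a custom enrichment prompt
# (illustrative text; assumes the template is formatted with a `context_text` variable):
settings.parsing.enrichment_prompt = (
    "{context_text}Describe this figure or table, noting its number,"
    " axis labels, legend entries, and any quantitative trends:"
)

# To keep multimodal parsing but skip enrichment entirely,
# set `parsing.multimodal` to its `ON_WITHOUT_ENRICHMENT` option instead.

docs = Docs()
# await docs.aadd("my_paper.pdf", settings=settings)  # synthetic captions are generated during this read
```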

When creating contextual summaries on a given chunk (a `Text`),
the summary LLM is passed both the chunk's text and the chunk's associated media,
but the output contextual summary itself remains text-only.
@@ -926,13 +944,17 @@ will return much faster than the first query and we'll be certain the authors ma
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
| `parsing.overlap` | `250` | Characters to overlap chunks. |
| `parsing.multimodal` | `True` | Flag to parse both text and images from applicable documents. |
| `parsing.multimodal` | `True` | Controls parsing of both text and media from applicable documents, as well as optionally enriching media with text descriptions. |
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
| `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. |
| `parsing.configure_pdf_parser` | No-op | Callable to configure the PDF parser within `parse_pdf`, useful for behaviors such as enabling logging. |
| `parsing.chunking_algorithm` | `ChunkingOptions.SIMPLE_OVERLAP` | Algorithm for chunking. |
| `parsing.doc_filters` | `None` | Optional filters for allowed documents. |
| `parsing.use_human_readable_clinical_trials` | `False` | Parse clinical trial JSONs into readable text. |
| `parsing.enrichment_llm` | `"gpt-4o-2024-11-20"` | LLM for media enrichment. |
| `parsing.enrichment_llm_config` | `None` | Optional configuration for `enrichment_llm`. |
| `parsing.enrichment_page_radius` | `1` | Page radius for context text in enrichment. |
| `parsing.enrichment_prompt` | `image_enrichment_prompt_template` | Prompt template for enriching media. |
| `prompt.summary` | `summary_prompt` | Template for summarizing text, must contain variables matching `summary_prompt`. |
| `prompt.qa` | `qa_prompt` | Template for QA, must contain variables matching `qa_prompt`. |
| `prompt.select` | `select_paper_prompt` | Template for selecting papers, must contain variables matching `select_paper_prompt`. |
8 changes: 5 additions & 3 deletions src/paperqa/core.py
@@ -232,17 +232,19 @@ async def _map_fxn_summary( # noqa: PLR0912
cleaned_text = text.text.strip("\n") or "(no text)"
if summary_llm_model and prompt_templates:
unique_media = list(dict.fromkeys(text.media)) # Preserve order
media_text: list[str] = [m.text for m in unique_media if m.text]
table_texts: list[str] = [
m.text for m in unique_media if m.info.get("type") == "table" and m.text
]
data = {
"question": question,
"citation": citation,
"text": (
text_with_tables_prompt_template.format(
text=cleaned_text,
citation=citation,
tables="\n\n".join(media_text),
tables="\n\n".join(table_texts),
)
if media_text
if table_texts
else cleaned_text
),
} | (extra_prompt_data or {})
35 changes: 30 additions & 5 deletions src/paperqa/docs.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
import json
import logging
import os
@@ -387,16 +388,22 @@ async def aadd( # noqa: PLR0912
doc, **(query_kwargs | kwargs)
)

parse_images, enrich_media = parse_config.should_parse_and_enrich_media
multimodal_kwargs: dict[str, Any] = {"parse_images": parse_images}
if enrich_media:
multimodal_kwargs["multimodal_enricher"] = (
all_settings.make_media_enricher()
)
texts, metadata = await read_doc(
path,
doc,
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
parse_images=parse_config.multimodal,
parse_pdf=parse_config.parse_pdf,
include_metadata=True,
**multimodal_kwargs,
)
# loose check to see if document was loaded
if metadata.name != "image" and (
@@ -480,7 +487,16 @@ async def aadd_texts(
if embedding_model and texts[0].embedding is None:
for t, t_embedding in zip(
texts,
await embedding_model.embed_documents(texts=[t.text for t in texts]),
await embedding_model.embed_documents(
texts=await asyncio.gather(
*(
t.get_embeddable_text(
all_settings.parsing.should_parse_and_enrich_media[1]
)
for t in texts
)
)
),
strict=True,
):
t.embedding = t_embedding
@@ -534,14 +550,20 @@ def delete(
self.deleted_dockeys.add(dockey)
self.texts = list(filter(lambda x: x.doc.dockey != dockey, self.texts))

async def _build_texts_index(self, embedding_model: EmbeddingModel) -> None:
async def _build_texts_index(
self, embedding_model: EmbeddingModel, with_enrichment: bool = False
) -> None:
texts = [t for t in self.texts if t not in self.texts_index]
# For any embeddings we are supposed to lazily embed, embed them now
to_embed = [t for t in texts if t.embedding is None]
if to_embed:
for t, t_embedding in zip(
to_embed,
await embedding_model.embed_documents(texts=[t.text for t in to_embed]),
await embedding_model.embed_documents(
texts=await asyncio.gather(
*(t.get_embeddable_text(with_enrichment) for t in to_embed)
)
),
strict=True,
):
t.embedding = t_embedding
@@ -563,7 +585,10 @@ async def retrieve_texts(
# TODO: should probably happen elsewhere
self.texts_index.mmr_lambda = settings.texts_index_mmr_lambda

await self._build_texts_index(embedding_model)
await self._build_texts_index(
embedding_model,
with_enrichment=settings.parsing.should_parse_and_enrich_media[1],
)
_k = k + len(self.deleted_dockeys)
matches: list[Text] = cast(
"list[Text]",
36 changes: 36 additions & 0 deletions src/paperqa/prompts.py
@@ -162,3 +162,39 @@
EMPTY_CONTEXTS = len(CONTEXT_OUTER_PROMPT.format(context_str="", valid_keys="").strip())
CONTEXT_INNER_PROMPT_NOT_DETAILED = "{name}: {text}"
CONTEXT_INNER_PROMPT = f"{CONTEXT_INNER_PROMPT_NOT_DETAILED}\nFrom {{citation}}"

# For reference, here's Docling's image description prompt:
# https://github.com/docling-project/docling/blob/v2.55.1/docling/datamodel/pipeline_options.py#L214-L216
media_enrichment_prompt_template = (
"You are analyzing an image or table from a scientific document."
" Provide a detailed description that will be used to answer questions about its content."
" Focus on key elements, data, relationships, and scientific insights visible in the image."
" It's especially important to document referential information such as"
" figure/table numbers, labels, plot colors, or legends."
"\n\nText co-located with the media may be associated with"
" other media or unrelated content,"
" so do not just blindly quote referential information."
" The smaller the image, the more likely co-located text is unrelated."
" To restate, often the co-located text is several pages of content,"
" so only use aspects relevant to accompanying image or table."
"\n\nHere's a few failure mode with possible resolutions:"
"\n- The media was a logo or icon, so the text is unrelated."
" In this case, briefly describe the media as a logo or icon,"
" and do not mention other unrelated surrounding text."
"\n- The media was display type, so the text is probably unrelated."
" The display type can be spread over several lines."
" In this case, briefly describe the media as display type,"
" and do not mention other unrelated surrounding text."
"\n- The media is a margin box or design element, so the text is unrelated."
" In this case, briefly describe the media as decorative,"
" and do not mention other unrelated surrounding text."
"\n- The media came from a bad PDF read, so it's garbled."
" In this case, describe the media as garbled, state why it's considered garbled,"
" and do not mention other unrelated surrounding text."
"\n- The media is a subfigure or a subtable."
" In this case, make sure to only detail the subfigure or subtable,"
" not the entire figure or table."
" Do not mention other unrelated surrounding text."
"\n\n{context_text}Describe the media," # Allow for empty context_text
" or if uncertain on a description please state why:"
)
31 changes: 26 additions & 5 deletions src/paperqa/readers.py
@@ -275,6 +275,8 @@ def chunk_code_text(


IMAGE_EXTENSIONS = tuple({".png", ".jpg", ".jpeg"})
# When the HTML reader supports images, add its extensions here
ENRICHMENT_EXTENSIONS = tuple({".pdf", *IMAGE_EXTENSIONS})


@overload
@@ -285,6 +287,7 @@ async def read_doc(
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> ParsedText: ...
@@ -296,6 +299,7 @@ async def read_doc(
include_metadata: Literal[False] = ...,
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> ParsedText: ...
@@ -307,6 +311,7 @@ async def read_doc(
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> tuple[list[Text], ParsedMetadata]: ...
@@ -318,6 +323,7 @@ async def read_doc(
include_metadata: Literal[False] = ...,
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> list[Text]: ...
@@ -329,6 +335,8 @@ async def read_doc(
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> tuple[list[Text], ParsedMetadata]: ...
@@ -339,6 +347,7 @@ async def read_doc( # noqa: PLR0912
include_metadata: bool = False,
chunk_chars: int = 3000,
overlap: int = 100,
multimodal_enricher: Callable[[ParsedText], Awaitable[str]] | None = None,
parse_pdf: PDFParserFn | None = None,
**parser_kwargs,
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
@@ -351,6 +360,8 @@ async def read_doc( # noqa: PLR0912
include_metadata: Opt-in flag to include metadata about the chunking algorithm.
chunk_chars: size of chunks
overlap: size of overlap between chunks
multimodal_enricher: Optional function to enrich the parsed text before chunking
and return a string summary that gets appended to the chunk metadata name.
parse_pdf: Optional function to parse PDF files (if you're parsing a PDF).
parser_kwargs: Keyword arguments to pass to the used parsing function.
"""
@@ -380,6 +391,13 @@ async def read_doc( # noqa: PLR0912
if parsed_text_only:
return parsed_text

# Enrich the full parsed text before chunking, since enrichment
# may view adjacent pages (and won't get cut off on chunk boundaries)
if str_path.endswith(ENRICHMENT_EXTENSIONS) and multimodal_enricher:
enrichment_summary: str = f"|{await multimodal_enricher(parsed_text)}"
else:
enrichment_summary = ""

# next chunk the parsed text

if chunk_chars == 0:
@@ -389,7 +407,10 @@
chunk_metadata = ChunkMetadata(
size=0,
overlap=0,
name=f"paper-qa={pqa_version}|algorithm=none|reduction=cl100k_base",
name=(
f"paper-qa={pqa_version}|algorithm=none"
f"|reduction=cl100k_base{enrichment_summary}"
),
)
elif str_path.endswith(".pdf"):
chunked_text = chunk_pdf(
@@ -400,7 +421,7 @@
overlap=overlap,
name=(
f"paper-qa={pqa_version}|algorithm=overlap-pdf"
f"|size={chunk_chars}|overlap={overlap}"
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
),
)
elif str_path.endswith(IMAGE_EXTENSIONS):
@@ -410,7 +431,7 @@
chunk_metadata = ChunkMetadata(
size=0,
overlap=0,
name=f"paper-qa={pqa_version}|algorithm=none",
name=f"paper-qa={pqa_version}|algorithm=none{enrichment_summary}",
)
elif str_path.endswith((".txt", ".html")):
chunked_text = chunk_text(
Expand All @@ -421,7 +442,7 @@ async def read_doc( # noqa: PLR0912
overlap=overlap,
name=(
f"paper-qa={pqa_version}|algorithm=overlap-text|reduction=cl100k_base"
f"|size={chunk_chars}|overlap={overlap}"
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
),
)
else:
@@ -433,7 +454,7 @@
overlap=overlap,
name=(
f"paper-qa={pqa_version}|algorithm=overlap-code|reduction=cl100k_base"
f"|size={chunk_chars}|overlap={overlap}"
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
),
)
