24 changes: 23 additions & 1 deletion README.md
@@ -747,6 +747,24 @@ Depending on the source document, the same image can appear multiple times
Thus, clients should consider media databases
to have a many-to-many relationship with chunks.

Since PaperQA's evidence-gathering process centers on text-based retrieval,
relevant images or tables may not be retrieved
when their associated text content is irrelevant to the query.
For a concrete example, imagine a figure in a paper has a terse caption
and is placed one page after the relevant main-text discussion.
To solve this problem, PaperQA supports media enrichment at document read time.
After reading in the PDF,
the `parsing.enrichment_llm` is given the `parsing.enrichment_prompt`
and co-located text to generate a synthetic caption for every image/table.
The synthetic captions are used to shift the embeddings of each text chunk,
but are kept separate from the actual source text.
This way evidence gathering can fetch relevant images/tables
without risk of polluting contextual summaries with LLM-generated captions.

If you want multimodal PDF reading but do not want enrichment
(since it adds one LLM prompt per media item at read time),
enrichment can be disabled by setting `parsing.multimodal` to `ON_WITHOUT_ENRICHMENT`.
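
As a quick illustration, here is a minimal sketch of tuning enrichment when adding a document
(mutating a default `Settings` object like this is just one convenient form,
and the file name below is hypothetical):

```python
from paperqa import Docs, Settings

settings = Settings()  # multimodal parsing with enrichment is on by default
settings.parsing.enrichment_llm = "gpt-4o-2024-11-20"  # LLM that writes the synthetic captions
settings.parsing.enrichment_page_radius = 2  # widen the page radius of co-located context text (default is 1)
# Optionally swap in a custom enrichment prompt
# (illustrative text; assumes the template is formatted with a `context_text` variable):
settings.parsing.enrichment_prompt = (
    "{context_text}Describe this figure or table, noting its number,"
    " axis labels, legend entries, and any quantitative trends:"
)

# To keep multimodal parsing but skip enrichment entirely,
# set `parsing.multimodal` to its `ON_WITHOUT_ENRICHMENT` option instead.

docs = Docs()
# await docs.aadd("my_paper.pdf", settings=settings)  # synthetic captions are generated during this read
```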

When creating contextual summaries on a given chunk (a `Text`),
the summary LLM is passed both the chunk's text and the chunk's associated media,
but the output contextual summary itself remains text-only.
@@ -926,13 +944,17 @@ will return much faster than the first query and we'll be certain the authors ma
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
| `parsing.overlap` | `250` | Characters to overlap chunks. |
| `parsing.multimodal` | `True` | Flag to parse both text and images from applicable documents. |
| `parsing.multimodal` | `True` | Controls parsing of both text and media from applicable documents, as well as optionally enriching media with text descriptions. |
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
| `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. |
| `parsing.configure_pdf_parser` | No-op | Callable to configure the PDF parser within `parse_pdf`, useful for behaviors such as enabling logging. |
| `parsing.chunking_algorithm` | `ChunkingOptions.SIMPLE_OVERLAP` | Algorithm for chunking. |
| `parsing.doc_filters` | `None` | Optional filters for allowed documents. |
| `parsing.use_human_readable_clinical_trials` | `False` | Parse clinical trial JSONs into readable text. |
| `parsing.enrichment_llm` | `"gpt-4o-2024-11-20"` | LLM for media enrichment. |
| `parsing.enrichment_llm_config` | `None` | Optional configuration for `enrichment_llm`. |
| `parsing.enrichment_page_radius` | `1` | Page radius for context text in enrichment. |
| `parsing.enrichment_prompt` | `image_enrichment_prompt_template` | Prompt template for enriching media. |
| `prompt.summary` | `summary_prompt` | Template for summarizing text, must contain variables matching `summary_prompt`. |
| `prompt.qa` | `qa_prompt` | Template for QA, must contain variables matching `qa_prompt`. |
| `prompt.select` | `select_paper_prompt` | Template for selecting papers, must contain variables matching `select_paper_prompt`. |
8 changes: 5 additions & 3 deletions src/paperqa/core.py
@@ -232,17 +232,19 @@ async def _map_fxn_summary( # noqa: PLR0912
cleaned_text = text.text.strip("\n") or "(no text)"
if summary_llm_model and prompt_templates:
unique_media = list(dict.fromkeys(text.media)) # Preserve order
media_text: list[str] = [m.text for m in unique_media if m.text]
table_texts: list[str] = [
m.text for m in unique_media if m.info.get("type") == "table" and m.text
]
data = {
"question": question,
"citation": citation,
"text": (
text_with_tables_prompt_template.format(
text=cleaned_text,
citation=citation,
tables="\n\n".join(media_text),
tables="\n\n".join(table_texts),
)
if media_text
if table_texts
else cleaned_text
),
} | (extra_prompt_data or {})
35 changes: 30 additions & 5 deletions src/paperqa/docs.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
import json
import logging
import os
@@ -387,16 +388,22 @@ async def aadd( # noqa: PLR0912
doc, **(query_kwargs | kwargs)
)

parse_images, enrich_media = parse_config.should_parse_and_enrich_media
multimodal_kwargs: dict[str, Any] = {"parse_images": parse_images}
if enrich_media:
multimodal_kwargs["multimodal_enricher"] = (
all_settings.make_media_enricher()
)
texts, metadata = await read_doc(
path,
doc,
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
parse_images=parse_config.multimodal,
parse_pdf=parse_config.parse_pdf,
include_metadata=True,
**multimodal_kwargs,
)
# loose check to see if document was loaded
if metadata.name != "image" and (
@@ -480,7 +487,16 @@ async def aadd_texts(
if embedding_model and texts[0].embedding is None:
for t, t_embedding in zip(
texts,
await embedding_model.embed_documents(texts=[t.text for t in texts]),
await embedding_model.embed_documents(
texts=await asyncio.gather(
*(
t.get_embeddable_text(
all_settings.parsing.should_parse_and_enrich_media[1]
)
for t in texts
)
)
),
strict=True,
):
t.embedding = t_embedding
@@ -534,14 +550,20 @@ def delete(
self.deleted_dockeys.add(dockey)
self.texts = list(filter(lambda x: x.doc.dockey != dockey, self.texts))

async def _build_texts_index(self, embedding_model: EmbeddingModel) -> None:
async def _build_texts_index(
self, embedding_model: EmbeddingModel, with_enrichment: bool = False
) -> None:
texts = [t for t in self.texts if t not in self.texts_index]
# For any embeddings we are supposed to lazily embed, embed them now
to_embed = [t for t in texts if t.embedding is None]
if to_embed:
for t, t_embedding in zip(
to_embed,
await embedding_model.embed_documents(texts=[t.text for t in to_embed]),
await embedding_model.embed_documents(
texts=await asyncio.gather(
*(t.get_embeddable_text(with_enrichment) for t in to_embed)
)
),
strict=True,
):
t.embedding = t_embedding
@@ -563,7 +585,10 @@ async def retrieve_texts(
# TODO: should probably happen elsewhere
self.texts_index.mmr_lambda = settings.texts_index_mmr_lambda

await self._build_texts_index(embedding_model)
await self._build_texts_index(
embedding_model,
with_enrichment=settings.parsing.should_parse_and_enrich_media[1],
)
_k = k + len(self.deleted_dockeys)
matches: list[Text] = cast(
"list[Text]",
36 changes: 36 additions & 0 deletions src/paperqa/prompts.py
@@ -162,3 +162,39 @@
EMPTY_CONTEXTS = len(CONTEXT_OUTER_PROMPT.format(context_str="", valid_keys="").strip())
CONTEXT_INNER_PROMPT_NOT_DETAILED = "{name}: {text}"
CONTEXT_INNER_PROMPT = f"{CONTEXT_INNER_PROMPT_NOT_DETAILED}\nFrom {{citation}}"

# For reference, here's Docling's image description prompt:
# https://github.com/docling-project/docling/blob/v2.55.1/docling/datamodel/pipeline_options.py#L214-L216
media_enrichment_prompt_template = (
"You are analyzing an image or table from a scientific document."
" Provide a detailed description that will be used to answer questions about its content."
" Focus on key elements, data, relationships, and scientific insights visible in the image."
" It's especially important to document referential information such as"
" figure/table numbers, labels, plot colors, or legends."
"\n\nText co-located with the media may be associated with"
" other media or unrelated content,"
" so do not just blindly quote referential information."
" The smaller the image, the more likely co-located text is unrelated."
" To restate, often the co-located text is several pages of content,"
" so only use aspects relevant to accompanying image or table."
"\n\nHere's a few failure mode with possible resolutions:"
"\n- The media was a logo or icon, so the text is unrelated."
" In this case, briefly describe the media as a logo or icon,"
" and do not mention other unrelated surrounding text."
"\n- The media was display type, so the text is probably unrelated."
" The display type can be spread over several lines."
" In this case, briefly describe the media as display type,"
" and do not mention other unrelated surrounding text."
"\n- The media is a margin box or design element, so the text is unrelated."
" In this case, briefly describe the media as decorative,"
" and do not mention other unrelated surrounding text."
"\n- The media came from a bad PDF read, so it's garbled."
" In this case, describe the media as garbled, state why it's considered garbled,"
" and do not mention other unrelated surrounding text."
"\n- The media is a subfigure or a subtable."
" In this case, make sure to only detail the subfigure or subtable,"
" not the entire figure or table."
" Do not mention other unrelated surrounding text."
"\n\n{context_text}Describe the media," # Allow for empty context_text
" or if uncertain on a description please state why:"
)
31 changes: 26 additions & 5 deletions src/paperqa/readers.py
@@ -275,6 +275,8 @@ def chunk_code_text(


IMAGE_EXTENSIONS = tuple({".png", ".jpg", ".jpeg"})
# When the HTML reader supports images, add its extensions here
ENRICHMENT_EXTENSIONS = tuple({".pdf", *IMAGE_EXTENSIONS})


@overload
@@ -285,6 +287,7 @@ async def read_doc(
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> ParsedText: ...
@@ -296,6 +299,7 @@ async def read_doc(
include_metadata: Literal[False] = ...,
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> ParsedText: ...
@@ -307,6 +311,7 @@ async def read_doc(
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> tuple[list[Text], ParsedMetadata]: ...
@@ -318,6 +323,7 @@ async def read_doc(
include_metadata: Literal[False] = ...,
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> list[Text]: ...
@@ -329,6 +335,8 @@ async def read_doc(
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
parse_pdf: PDFParserFn | None = ...,
**parser_kwargs,
) -> tuple[list[Text], ParsedMetadata]: ...
@@ -339,6 +347,7 @@ async def read_doc( # noqa: PLR0912
include_metadata: bool = False,
chunk_chars: int = 3000,
overlap: int = 100,
multimodal_enricher: Callable[[ParsedText], Awaitable[str]] | None = None,
parse_pdf: PDFParserFn | None = None,
**parser_kwargs,
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
@@ -351,6 +360,8 @@ async def read_doc( # noqa: PLR0912
include_metadata: Opt-in flag to include metadata about the chunking algorithm.
chunk_chars: size of chunks
overlap: size of overlap between chunks
multimodal_enricher: Optional function to enrich the parsed text before chunking
and return a string summary that gets appended to the chunk metadata name.
parse_pdf: Optional function to parse PDF files (if you're parsing a PDF).
parser_kwargs: Keyword arguments to pass to the used parsing function.
"""
@@ -380,6 +391,13 @@ async def read_doc( # noqa: PLR0912
if parsed_text_only:
return parsed_text

# Enrich the full parsed text before chunking, since enrichment
# may view adjacent pages (and won't get cut off on chunk boundaries)
if str_path.endswith(ENRICHMENT_EXTENSIONS) and multimodal_enricher:
enrichment_summary: str = f"|{await multimodal_enricher(parsed_text)}"
else:
enrichment_summary = ""

# next chunk the parsed text

if chunk_chars == 0:
@@ -389,7 +407,10 @@
chunk_metadata = ChunkMetadata(
size=0,
overlap=0,
name=f"paper-qa={pqa_version}|algorithm=none|reduction=cl100k_base",
name=(
f"paper-qa={pqa_version}|algorithm=none"
f"|reduction=cl100k_base{enrichment_summary}"
),
)
elif str_path.endswith(".pdf"):
chunked_text = chunk_pdf(
@@ -400,7 +421,7 @@
overlap=overlap,
name=(
f"paper-qa={pqa_version}|algorithm=overlap-pdf"
f"|size={chunk_chars}|overlap={overlap}"
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
),
)
elif str_path.endswith(IMAGE_EXTENSIONS):
@@ -410,7 +431,7 @@
chunk_metadata = ChunkMetadata(
size=0,
overlap=0,
name=f"paper-qa={pqa_version}|algorithm=none",
name=f"paper-qa={pqa_version}|algorithm=none{enrichment_summary}",
)
elif str_path.endswith((".txt", ".html")):
chunked_text = chunk_text(
Expand All @@ -421,7 +442,7 @@ async def read_doc( # noqa: PLR0912
overlap=overlap,
name=(
f"paper-qa={pqa_version}|algorithm=overlap-text|reduction=cl100k_base"
f"|size={chunk_chars}|overlap={overlap}"
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
),
)
else:
@@ -433,7 +454,7 @@
overlap=overlap,
name=(
f"paper-qa={pqa_version}|algorithm=overlap-code|reduction=cl100k_base"
f"|size={chunk_chars}|overlap={overlap}"
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
),
)
