Skip to content

Commit

Permalink
More robust JSON parsing (#270)
Browse files Browse the repository at this point in the history
* More robust JSON parsing

* Added missing dev dependency

* Update tests/test_paperqa.py

Co-authored-by: James Braza <[email protected]>

* Switched to pytest raise values

* Fixed more unit tests from PR review

---------

Co-authored-by: James Braza <[email protected]>
  • Loading branch information
whitead and jamesbraza authored Apr 14, 2024
1 parent 7b9c201 commit c4957cb
Show file tree
Hide file tree
Showing 5 changed files with 248 additions and 210 deletions.
3 changes: 3 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@ pre-commit
pytest
types-requests
types-setuptools
pytest-asyncio
pytest-sugar
pytest-timer
5 changes: 2 additions & 3 deletions paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
gather_with_concurrency,
get_loop,
guess_is_4xx,
llm_read_json,
maybe_is_html,
maybe_is_pdf,
maybe_is_text,
Expand Down Expand Up @@ -657,9 +658,7 @@ async def process(match): # noqa: C901, PLR0912
success = True
if self.prompts.summary_json:
try:
# fetch from markdown ```json if present
context = context.split("```json")[-1].split("```")[0]
result_data = json.loads(context)
result_data = llm_read_json(context)
except json.decoder.JSONDecodeError:
# fallback to string
success = False
Expand Down
12 changes: 12 additions & 0 deletions paperqa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import asyncio
import inspect
import json
import math
import re
import string
Expand Down Expand Up @@ -179,3 +180,14 @@ def is_coroutine_callable(obj):
elif callable(obj): # noqa: RET505
return inspect.iscoroutinefunction(obj.__call__)
return False


def llm_read_json(text: str) -> dict:
    """Read LLM output and extract the JSON object embedded in it.

    The model may wrap the object in a markdown code fence and/or surround it
    with conversational text. Extraction proceeds as follows:

    1. If a fence tagged ``json`` is present, use the contents of the last one.
    2. Otherwise split on bare fences and use the first segment containing
       ``{`` (this covers untagged fences, with or without leading prose).
    3. Drop everything before the first ``{`` and after the last ``}``, adding
       whichever outer brace is missing so slightly truncated output can still
       parse.

    Args:
        text: Raw text returned by the LLM.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: If nothing parseable is found, including empty
            or whitespace-only input (no empty dict is fabricated).
    """
    fence = "```"
    stripped = text.strip()
    if "```json" in stripped:
        # explicit json fence: take what follows the last opening tag, up to
        # the closing fence
        candidate = stripped.split("```json")[-1].split(fence)[0]
    else:
        # untagged fence (or none): the payload is the first segment with an
        # opening brace; without any brace, keep the leading segment so the
        # brace repair below still gets a chance
        segments = stripped.split(fence)
        candidate = next((seg for seg in segments if "{" in seg), segments[0])
    if not candidate.strip():
        # wrapping nothing in braces would yield "{}" and hide the failure
        raise json.JSONDecodeError("No JSON object found in LLM output", text, 0)
    # strip anything before the first {
    candidate = "{" + candidate.split("{", 1)[-1]
    # strip anything after the last }
    candidate = candidate.rsplit("}", 1)[0] + "}"
    return json.loads(candidate)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ name = "paper-qa"
readme = "README.md"
requires-python = ">=3.8"
urls = {repository = "https://github.com/whitead/paper-qa"}
version = "4.5.0"
version = "4.5.1"

[tool.codespell]
check-filenames = true
Expand Down
Loading

0 comments on commit c4957cb

Please sign in to comment.