diff --git a/README.md b/README.md index b331929f..5c2adadc 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ question answering, summarization, and contradiction detection. - [Using External DB/Vector DB and Caching](#using-external-dbvector-db-and-caching) - [Reusing Index](#reusing-index) - [Running on LitQA v2](#running-on-litqa-v2) + - [Using Clients Directly](#using-clients-directly) - [Where do I get papers?](#where-do-i-get-papers) - [Zotero](#zotero) - [Paper Scraper](#paper-scraper) @@ -603,6 +604,47 @@ async def evaluate(folder_of_litqa_v2_papers: str | os.PathLike) -> None: print(metrics_callback.eval_means) ``` +### Using Clients Directly + +One of the most powerful features of PaperQA2 is its ability to combine data from multiple metadata sources. For example, [Unpaywall](https://unpaywall.org/) can provide open access status/direct links to PDFs, [Crossref](https://www.crossref.org/) can provide bibtex, and [Semantic Scholar](https://www.semanticscholar.org/) can provide citation licenses. Here's a short demo of how to do this: + +```python +from paperqa.clients import DocMetadataClient, ALL_CLIENTS + +client = DocMetadataClient(clients=ALL_CLIENTS) +details = await client.query(title="Augmenting language models with chemistry tools") + +print(details.formatted_citation) +# Andres M. Bran, Sam Cox, Oliver Schilter, Carlo Baldassari, Andrew D. White, and Philippe Schwaller. +# Augmenting large language models with chemistry tools. Nature Machine Intelligence, +# 6:525-535, May 2024. URL: https://doi.org/10.1038/s42256-024-00832-8, +# doi:10.1038/s42256-024-00832-8. +# This article has 243 citations and is from a domain leading peer-reviewed journal. + +print(details.citation_count) +# 243 + +print(details.license) +# cc-by + +print(details.pdf_url) +# https://www.nature.com/articles/s42256-024-00832-8.pdf +``` + +The `client.query` method is meant to check for exact matches of title. It's somewhat robust (e.g., to differences in casing or a missing word). 
Titles are not always unique, though, so you can also add authors to disambiguate. Alternatively, you can provide a DOI directly: `client.query(doi="10.1038/s42256-024-00832-8")`. + +If you're doing this at a large scale, you may not want to use `ALL_CLIENTS` (just omit the `clients` argument), and you can specify only the fields you need to speed up queries. For example: + +```python +details = await client.query( + title="Augmenting large language models with chemistry tools", + authors=["Andres M. Bran", "Sam Cox"], + fields=["title", "doi"], +) +``` + +This will return much faster than the first query, and we can be certain that the authors match. + ## Where do I get papers? Well that's a really good question! It's probably best to just download PDFs of papers you think will help answer your question and start from there. diff --git a/paperqa/clients/unpaywall.py b/paperqa/clients/unpaywall.py index 297cd0bf..beb1f406 100644 --- a/paperqa/clients/unpaywall.py +++ b/paperqa/clients/unpaywall.py @@ -158,6 +158,12 @@ async def search_by_title( return details def _create_doc_details(self, data: UnpaywallResponse) -> DocDetails: + # extract pdf location if present + pdf_url: str | None = None + license: str | None = None # noqa: A001 + if data.best_oa_location: + pdf_url = data.best_oa_location.url_for_pdf + license = data.best_oa_location.license # noqa: A001 return DocDetails( # type: ignore[call-arg] authors=[ f"{author.given} {author.family}" for author in (data.z_authors or []) @@ -174,6 +180,8 @@ def _create_doc_details(self, data: UnpaywallResponse) -> DocDetails: title=data.title, doi=data.doi, doi_url=data.doi_url, + license=license, + pdf_url=pdf_url, other={ "genre": data.genre, "is_paratext": data.is_paratext, diff --git a/paperqa/types.py b/paperqa/types.py index 43ce8106..7d3a5ffc 100644 --- a/paperqa/types.py +++ b/paperqa/types.py @@ -334,6 +334,12 @@ class DocDetails(Doc): doi_url: str | None = None doc_id: str | None = None file_location: str | os.PathLike | None = None + 
license: str | None = Field( + default=None, + description="string indicating license." + " Should refer specifically to pdf_url (since that could be preprint). None means unknown/unset.", + ) + pdf_url: str | None = None other: dict[str, Any] = Field( default_factory=dict, description="Other metadata besides the above standardized fields.", @@ -570,17 +576,21 @@ def formatted_citation(self) -> str: " to call `hydrate`?" ) - quality = ( - SOURCE_QUALITY_MESSAGES[self.source_quality] - if self.source_quality >= 0 - else None - ) + if self.source_quality_message: + return ( + f"{self.citation} This article has {self.citation_count} citations and is" + f" from a {self.source_quality_message}." + ) + return f"{self.citation} This article has {self.citation_count} citations." - if quality is None: - return f"{self.citation} This article has {self.citation_count} citations." + @property + def source_quality_message(self) -> str: return ( - f"{self.citation} This article has {self.citation_count} citations and is" - f" from a {quality}." + SOURCE_QUALITY_MESSAGES[self.source_quality] + if self.source_quality is not None + and self.source_quality + != DocDetails.UNDEFINED_JOURNAL_QUALITY # note - zero is a valid value + else "" ) OPTIONAL_HYDRATION_FIELDS: ClassVar[Collection[str]] = {"url"}