diff --git a/notebooks/reach_comparison.ipynb b/notebooks/reach_comparison.ipynb deleted file mode 100644 index 09f8147..0000000 --- a/notebooks/reach_comparison.ipynb +++ /dev/null @@ -1,265 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 12, - "id": "d8db752d-8b39-4058-a11f-0fb215a728e3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warming up dockerized REACH at http://localhost:8080/api/text\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - Could not connect to REACH service:\n", - "ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "finished warming up\n" - ] - } - ], - "source": [ - "from indra.sources import reach\n", - "from indra.literature import pubmed_client\n", - "from typing import NamedTuple\n", - "import gilda\n", - "\n", - "# See https://indra.readthedocs.io/en/latest/modules/sources/reach/index.html for setting up REACH locally\n", - "print(f\"Warming up dockerized REACH at {reach.local_text_url}\")\n", - "reach_processor = reach.process_text(\"this is about a vaccine\", url=reach.local_text_url)\n", - "print(\"finished warming up\")\n", - "reach_processor" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5d1cabbd-8622-4c47-ba33-9b0a1e10ceae", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "using reach at http://localhost:8080/api/text\n" - ] - } - ], - "source": [ - "class ReachEntity(NamedTuple):\n", - " text: str\n", - " start: int\n", - " end: int\n", - "\n", - "print(f\"using reach at {reach.local_text_url}\")\n", - "\n", - "def get_reach_entities(text: str, pubmed=None) -> list[ReachEntity]:\n", - " reach_processor = reach.process_text(\n", - " text, \n", - " citation=pubmed, \n", - " # url=reach.local_text_url,\n", - " )\n", - " if reach_processor is None:\n", - " raise ValueError(\"reach.process_text() returned None\")\n", - " return [\n", - " ReachEntity(\n", - " text=data[\"text\"], \n", - " start=data[\"start-pos\"][\"offset\"], \n", - " end=data[\"end-pos\"][\"offset\"],\n", - " )\n", - " for data in reach_processor.get_all_entities()\n", - " ]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "0706c888-f1bf-49d3-a374-5549501ac32d", - "metadata": {}, - "outputs": [], - "source": [ - "pubmed_id = \"37192450\"\n", - "abstract = pubmed_client.get_abstract(pubmed_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d80b76c0-1780-44b2-991b-4a1e91430751", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[ReachEntity(text='COVID-19 vaccines', start=108, end=125),\n", - " ReachEntity(text='SARS-CoV-2 polymerase', start=646, end=667),\n", - " ReachEntity(text='test', start=683, end=687),\n", - " ReachEntity(text='MV', start=954, end=956),\n", - " ReachEntity(text='duration', start=958, end=966),\n", - " ReachEntity(text='MV', start=1218, end=1220),\n", - " ReachEntity(text='membrane', start=1292, end=1300),\n", - " ReachEntity(text='duration', start=1477, end=1485),\n", - " ReachEntity(text='ICU LOS', start=1519, end=1526),\n", - " ReachEntity(text='MV', start=1486, end=1488),\n", - " ReachEntity(text='CI', start=1450, end=1452),\n", - " ReachEntity(text='duration', start=1775, end=1783),\n", - " ReachEntity(text='CI', start=1754, end=1756),\n", - " ReachEntity(text='MV', start=1787, end=1789),\n", - " ReachEntity(text='ICU', start=2176, end=2179)]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_reach_entities(abstract, pubmed_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c652492c-9417-41e7-8e85-a9c3ebe39b46", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO: [2023-09-29 12:52:14] numexpr.utils - Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n", - "INFO: [2023-09-29 12:52:14] numexpr.utils - NumExpr defaulting to 8 threads.\n", - "WARNING: [2023-09-29 12:52:16] indra.literature.pubmed_client - Not all ids were retrieved for search vaccine[tw];\n", - "limited at 10000.\n" - ] - } - ], - "source": [ - "from kestrel.sources.literature.utils import get_pubmed_dataframe\n", - "\n", - "pmids = pubmed_client.get_ids(\"vaccine\")\n", - "df = get_pubmed_dataframe(pmids)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "85066402-30b8-4534-96ba-761e602a7947", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
phrasefrequency
0Phase 1b trial1
1Sm-TSP-2 Vaccine1
2Enugu1
\n", - "
" - ], - "text/plain": [ - " phrase frequency\n", - "0 Phase 1b trial 1\n", - "1 Sm-TSP-2 Vaccine 1\n", - "2 Enugu 1" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def find_gilda_missing(text: str, *, grounder: gilda.Grounder | None = None, pubmed=None):\n", - " if grounder is None:\n", - " grounder = gilda.get_grounder()\n", - " entities = get_reach_entities(text, pubmed=pubmed)\n", - " return [\n", - " t\n", - " for t in entities\n", - " if not grounder.ground(t.text)\n", - " ]\n", - "\n", - "from collections import Counter\n", - "\n", - "#dd = defaultdict(lambda: defaultdict(set))\n", - "dd = Counter()\n", - "for pubmed, row in df.head(5).iterrows():\n", - " for t in find_gilda_missing(row[\"title\"], pubmed=pubmed):\n", - " dd[t.text] += 1\n", - "\n", - "import pandas as pd\n", - "\n", - "results_df = pd.DataFrame(dd.most_common(), columns=[\"phrase\", \"frequency\"])\n", - "results_df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}