Create reach_comparison.ipynb

gyorilab · Sep 29, 2023 · 339b6d3 · 339b6d3
1 parent c2efdf1
commit 339b6d3
Showing 1 changed file with 265 additions and 0 deletions.
diff --git a/notebooks/reach_comparison.ipynb b/notebooks/reach_comparison.ipynb
@@ -0,0 +1,265 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "d8db752d-8b39-4058-a11f-0fb215a728e3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warming up dockerized REACH at http://localhost:8080/api/text\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - Could not connect to REACH service:\n",
+      "ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "finished warming up\n"
+     ]
+    }
+   ],
+   "source": [
+    "from indra.sources import reach\n",
+    "from indra.literature import pubmed_client\n",
+    "from typing import NamedTuple\n",
+    "import gilda\n",
+    "\n",
+    "# See https://indra.readthedocs.io/en/latest/modules/sources/reach/index.html for setting up REACH locally\n",
+    "print(f\"Warming up dockerized REACH at {reach.local_text_url}\")\n",
+    "reach_processor = reach.process_text(\"this is about a vaccine\", url=reach.local_text_url)\n",
+    "print(\"finished warming up\")\n",
+    "reach_processor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "5d1cabbd-8622-4c47-ba33-9b0a1e10ceae",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "using reach at http://localhost:8080/api/text\n"
+     ]
+    }
+   ],
+   "source": [
+    "class ReachEntity(NamedTuple):\n",
+    "    text: str\n",
+    "    start: int\n",
+    "    end: int\n",
+    "\n",
+    "print(f\"using reach at {reach.local_text_url}\")\n",
+    "\n",
+    "def get_reach_entities(text: str, pubmed=None) -> list[ReachEntity]:\n",
+    "    reach_processor = reach.process_text(\n",
+    "        text, \n",
+    "        citation=pubmed, \n",
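+    "        # url=reach.local_text_url,  # disabled: process_text is called without url, so its default endpoint is used, not the local URL printed above\n",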
+    "    )\n",
+    "    if reach_processor is None:\n",
+    "        raise ValueError(\"reach.process_text() returned None\")\n",
+    "    return [\n",
+    "        ReachEntity(\n",
+    "            text=data[\"text\"], \n",
+    "            start=data[\"start-pos\"][\"offset\"], \n",
+    "            end=data[\"end-pos\"][\"offset\"],\n",
+    "        )\n",
+    "        for data in reach_processor.get_all_entities()\n",
+    "    ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0706c888-f1bf-49d3-a374-5549501ac32d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pubmed_id = \"37192450\"\n",
+    "abstract = pubmed_client.get_abstract(pubmed_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d80b76c0-1780-44b2-991b-4a1e91430751",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[ReachEntity(text='COVID-19 vaccines', start=108, end=125),\n",
+       " ReachEntity(text='SARS-CoV-2 polymerase', start=646, end=667),\n",
+       " ReachEntity(text='test', start=683, end=687),\n",
+       " ReachEntity(text='MV', start=954, end=956),\n",
+       " ReachEntity(text='duration', start=958, end=966),\n",
+       " ReachEntity(text='MV', start=1218, end=1220),\n",
+       " ReachEntity(text='membrane', start=1292, end=1300),\n",
+       " ReachEntity(text='duration', start=1477, end=1485),\n",
+       " ReachEntity(text='ICU LOS', start=1519, end=1526),\n",
+       " ReachEntity(text='MV', start=1486, end=1488),\n",
+       " ReachEntity(text='CI', start=1450, end=1452),\n",
+       " ReachEntity(text='duration', start=1775, end=1783),\n",
+       " ReachEntity(text='CI', start=1754, end=1756),\n",
+       " ReachEntity(text='MV', start=1787, end=1789),\n",
+       " ReachEntity(text='ICU', start=2176, end=2179)]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_reach_entities(abstract, pubmed_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "c652492c-9417-41e7-8e85-a9c3ebe39b46",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO: [2023-09-29 12:52:14] numexpr.utils - Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
+      "INFO: [2023-09-29 12:52:14] numexpr.utils - NumExpr defaulting to 8 threads.\n",
+      "WARNING: [2023-09-29 12:52:16] indra.literature.pubmed_client - Not all ids were retrieved for search vaccine[tw];\n",
+      "limited at 10000.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from kestrel.sources.literature.utils import get_pubmed_dataframe\n",
+    "\n",
+    "pmids = pubmed_client.get_ids(\"vaccine\")\n",
+    "df = get_pubmed_dataframe(pmids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "85066402-30b8-4534-96ba-761e602a7947",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>phrase</th>\n",
+       "      <th>frequency</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Phase 1b trial</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Sm-TSP-2 Vaccine</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Enugu</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             phrase  frequency\n",
+       "0    Phase 1b trial          1\n",
+       "1  Sm-TSP-2 Vaccine          1\n",
+       "2             Enugu          1"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def find_gilda_missing(text: str, *, grounder: gilda.Grounder | None = None, pubmed=None):\n",
+    "    if grounder is None:\n",
+    "        grounder = gilda.get_grounder()\n",
+    "    entities = get_reach_entities(text, pubmed=pubmed)\n",
+    "    return [\n",
+    "        t\n",
+    "        for t in entities\n",
+    "        if not grounder.ground(t.text)\n",
+    "    ]\n",
+    "\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Abandoned alternative (would need a defaultdict import): dd = defaultdict(lambda: defaultdict(set))\n",
+    "dd = Counter()\n",
+    "for pubmed, row in df.head(5).iterrows():\n",
+    "    for t in find_gilda_missing(row[\"title\"], pubmed=pubmed):\n",
+    "        dd[t.text] += 1\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "results_df = pd.DataFrame(dd.most_common(), columns=[\"phrase\", \"frequency\"])\n",
+    "results_df"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}