Skip to content

Commit

Permalink
Create reach_comparison.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Sep 29, 2023
1 parent c2efdf1 commit 339b6d3
Showing 1 changed file with 265 additions and 0 deletions.
265 changes: 265 additions & 0 deletions notebooks/reach_comparison.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"id": "d8db752d-8b39-4058-a11f-0fb215a728e3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warming up dockerized REACH at http://localhost:8080/api/text\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - Could not connect to REACH service:\n",
"ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"finished warming up\n"
]
}
],
"source": [
"from indra.sources import reach\n",
"from indra.literature import pubmed_client\n",
"from typing import NamedTuple\n",
"import gilda\n",
"\n",
"# See https://indra.readthedocs.io/en/latest/modules/sources/reach/index.html for setting up REACH locally\n",
"print(f\"Warming up dockerized REACH at {reach.local_text_url}\")\n",
"reach_processor = reach.process_text(\"this is about a vaccine\", url=reach.local_text_url)\n",
"print(\"finished warming up\")\n",
"reach_processor"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5d1cabbd-8622-4c47-ba33-9b0a1e10ceae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"using reach at http://localhost:8080/api/text\n"
]
}
],
"source": [
"class ReachEntity(NamedTuple):\n",
" text: str\n",
" start: int\n",
" end: int\n",
"\n",
"print(f\"using reach at {reach.local_text_url}\")\n",
"\n",
"def get_reach_entities(text: str, pubmed=None) -> list[ReachEntity]:\n",
" reach_processor = reach.process_text(\n",
" text, \n",
" citation=pubmed, \n",
" # url=reach.local_text_url,\n",
" )\n",
" if reach_processor is None:\n",
" raise ValueError(\"reach.process_text() returned None\")\n",
" return [\n",
" ReachEntity(\n",
" text=data[\"text\"], \n",
" start=data[\"start-pos\"][\"offset\"], \n",
" end=data[\"end-pos\"][\"offset\"],\n",
" )\n",
" for data in reach_processor.get_all_entities()\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0706c888-f1bf-49d3-a374-5549501ac32d",
"metadata": {},
"outputs": [],
"source": [
"pubmed_id = \"37192450\"\n",
"abstract = pubmed_client.get_abstract(pubmed_id)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d80b76c0-1780-44b2-991b-4a1e91430751",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[ReachEntity(text='COVID-19 vaccines', start=108, end=125),\n",
" ReachEntity(text='SARS-CoV-2 polymerase', start=646, end=667),\n",
" ReachEntity(text='test', start=683, end=687),\n",
" ReachEntity(text='MV', start=954, end=956),\n",
" ReachEntity(text='duration', start=958, end=966),\n",
" ReachEntity(text='MV', start=1218, end=1220),\n",
" ReachEntity(text='membrane', start=1292, end=1300),\n",
" ReachEntity(text='duration', start=1477, end=1485),\n",
" ReachEntity(text='ICU LOS', start=1519, end=1526),\n",
" ReachEntity(text='MV', start=1486, end=1488),\n",
" ReachEntity(text='CI', start=1450, end=1452),\n",
" ReachEntity(text='duration', start=1775, end=1783),\n",
" ReachEntity(text='CI', start=1754, end=1756),\n",
" ReachEntity(text='MV', start=1787, end=1789),\n",
" ReachEntity(text='ICU', start=2176, end=2179)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_reach_entities(abstract, pubmed_id)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c652492c-9417-41e7-8e85-a9c3ebe39b46",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO: [2023-09-29 12:52:14] numexpr.utils - Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
"INFO: [2023-09-29 12:52:14] numexpr.utils - NumExpr defaulting to 8 threads.\n",
"WARNING: [2023-09-29 12:52:16] indra.literature.pubmed_client - Not all ids were retrieved for search vaccine[tw];\n",
"limited at 10000.\n"
]
}
],
"source": [
"from kestrel.sources.literature.utils import get_pubmed_dataframe\n",
"\n",
"pmids = pubmed_client.get_ids(\"vaccine\")\n",
"df = get_pubmed_dataframe(pmids)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "85066402-30b8-4534-96ba-761e602a7947",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>phrase</th>\n",
" <th>frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Phase 1b trial</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Sm-TSP-2 Vaccine</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Enugu</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" phrase frequency\n",
"0 Phase 1b trial 1\n",
"1 Sm-TSP-2 Vaccine 1\n",
"2 Enugu 1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def find_gilda_missing(text: str, *, grounder: gilda.Grounder | None = None, pubmed=None):\n",
" if grounder is None:\n",
" grounder = gilda.get_grounder()\n",
" entities = get_reach_entities(text, pubmed=pubmed)\n",
" return [\n",
" t\n",
" for t in entities\n",
" if not grounder.ground(t.text)\n",
" ]\n",
"\n",
"from collections import Counter\n",
"\n",
"#dd = defaultdict(lambda: defaultdict(set))\n",
"dd = Counter()\n",
"for pubmed, row in df.head(5).iterrows():\n",
" for t in find_gilda_missing(row[\"title\"], pubmed=pubmed):\n",
" dd[t.text] += 1\n",
"\n",
"import pandas as pd\n",
"\n",
"results_df = pd.DataFrame(dd.most_common(), columns=[\"phrase\", \"frequency\"])\n",
"results_df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 339b6d3

Please sign in to comment.