-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
265 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,265 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"id": "d8db752d-8b39-4058-a11f-0fb215a728e3", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Warming up dockerized REACH at http://localhost:8080/api/text\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - Could not connect to REACH service:\n", | ||
"ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"finished warming up\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from indra.sources import reach\n", | ||
"from indra.literature import pubmed_client\n", | ||
"from typing import NamedTuple\n", | ||
"import gilda\n", | ||
"\n", | ||
"# See https://indra.readthedocs.io/en/latest/modules/sources/reach/index.html for setting up REACH locally\n", | ||
"print(f\"Warming up dockerized REACH at {reach.local_text_url}\")\n", | ||
"reach_processor = reach.process_text(\"this is about a vaccine\", url=reach.local_text_url)\n", | ||
"print(\"finished warming up\")\n", | ||
"reach_processor" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "5d1cabbd-8622-4c47-ba33-9b0a1e10ceae", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"using reach at http://localhost:8080/api/text\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"class ReachEntity(NamedTuple):\n", | ||
" text: str\n", | ||
" start: int\n", | ||
" end: int\n", | ||
"\n", | ||
"print(f\"using reach at {reach.local_text_url}\")\n", | ||
"\n", | ||
"def get_reach_entities(text: str, pubmed=None) -> list[ReachEntity]:\n", | ||
" reach_processor = reach.process_text(\n", | ||
" text, \n", | ||
" citation=pubmed, \n", | ||
" # url=reach.local_text_url,\n", | ||
" )\n", | ||
" if reach_processor is None:\n", | ||
" raise ValueError(\"reach.process_text() returned None\")\n", | ||
" return [\n", | ||
" ReachEntity(\n", | ||
" text=data[\"text\"], \n", | ||
" start=data[\"start-pos\"][\"offset\"], \n", | ||
" end=data[\"end-pos\"][\"offset\"],\n", | ||
" )\n", | ||
" for data in reach_processor.get_all_entities()\n", | ||
" ]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "0706c888-f1bf-49d3-a374-5549501ac32d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pubmed_id = \"37192450\"\n", | ||
"abstract = pubmed_client.get_abstract(pubmed_id)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "d80b76c0-1780-44b2-991b-4a1e91430751", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[ReachEntity(text='COVID-19 vaccines', start=108, end=125),\n", | ||
" ReachEntity(text='SARS-CoV-2 polymerase', start=646, end=667),\n", | ||
" ReachEntity(text='test', start=683, end=687),\n", | ||
" ReachEntity(text='MV', start=954, end=956),\n", | ||
" ReachEntity(text='duration', start=958, end=966),\n", | ||
" ReachEntity(text='MV', start=1218, end=1220),\n", | ||
" ReachEntity(text='membrane', start=1292, end=1300),\n", | ||
" ReachEntity(text='duration', start=1477, end=1485),\n", | ||
" ReachEntity(text='ICU LOS', start=1519, end=1526),\n", | ||
" ReachEntity(text='MV', start=1486, end=1488),\n", | ||
" ReachEntity(text='CI', start=1450, end=1452),\n", | ||
" ReachEntity(text='duration', start=1775, end=1783),\n", | ||
" ReachEntity(text='CI', start=1754, end=1756),\n", | ||
" ReachEntity(text='MV', start=1787, end=1789),\n", | ||
" ReachEntity(text='ICU', start=2176, end=2179)]" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"get_reach_entities(abstract, pubmed_id)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "c652492c-9417-41e7-8e85-a9c3ebe39b46", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"INFO: [2023-09-29 12:52:14] numexpr.utils - Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n", | ||
"INFO: [2023-09-29 12:52:14] numexpr.utils - NumExpr defaulting to 8 threads.\n", | ||
"WARNING: [2023-09-29 12:52:16] indra.literature.pubmed_client - Not all ids were retrieved for search vaccine[tw];\n", | ||
"limited at 10000.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from kestrel.sources.literature.utils import get_pubmed_dataframe\n", | ||
"\n", | ||
"pmids = pubmed_client.get_ids(\"vaccine\")\n", | ||
"df = get_pubmed_dataframe(pmids)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"id": "85066402-30b8-4534-96ba-761e602a7947", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>phrase</th>\n", | ||
" <th>frequency</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>Phase 1b trial</td>\n", | ||
" <td>1</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>Sm-TSP-2 Vaccine</td>\n", | ||
" <td>1</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>Enugu</td>\n", | ||
" <td>1</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" phrase frequency\n", | ||
"0 Phase 1b trial 1\n", | ||
"1 Sm-TSP-2 Vaccine 1\n", | ||
"2 Enugu 1" | ||
] | ||
}, | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"def find_gilda_missing(text: str, *, grounder: gilda.Grounder | None = None, pubmed=None):\n", | ||
" if grounder is None:\n", | ||
" grounder = gilda.get_grounder()\n", | ||
" entities = get_reach_entities(text, pubmed=pubmed)\n", | ||
" return [\n", | ||
" t\n", | ||
" for t in entities\n", | ||
" if not grounder.ground(t.text)\n", | ||
" ]\n", | ||
"\n", | ||
"from collections import Counter\n", | ||
"\n", | ||
"#dd = defaultdict(lambda: defaultdict(set))\n", | ||
"dd = Counter()\n", | ||
"for pubmed, row in df.head(5).iterrows():\n", | ||
" for t in find_gilda_missing(row[\"title\"], pubmed=pubmed):\n", | ||
" dd[t.text] += 1\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"results_df = pd.DataFrame(dd.most_common(), columns=[\"phrase\", \"frequency\"])\n", | ||
"results_df" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |