From b8316899f4b9340683cf7c98c5338ddd7dc09821 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 23 Jul 2024 22:43:36 -0400 Subject: [PATCH 1/6] Use absolute imports --- gilda/generate_terms.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gilda/generate_terms.py b/gilda/generate_terms.py index 9e5fcf6..632c50a 100644 --- a/gilda/generate_terms.py +++ b/gilda/generate_terms.py @@ -12,9 +12,9 @@ from indra.databases import hgnc_client, uniprot_client, chebi_client, \ go_client, mesh_client, doid_client from indra.statements.resources import amino_acids -from .term import Term, dump_terms, filter_out_duplicates -from .process import normalize -from .resources import resource_dir, popular_organisms +from gilda.term import Term, dump_terms, filter_out_duplicates +from gilda.process import normalize +from gilda.resources import resource_dir, popular_organisms indra_module_path = indra.__path__[0] @@ -666,7 +666,7 @@ def _generate_obo_terms(prefix, ignore_mappings=False, map_to_ns=None): def _make_mesh_mappings(): # Load MeSH ID/label mappings - from .resources import MESH_MAPPINGS_PATH + from gilda.resources import MESH_MAPPINGS_PATH mesh_mappings = {} mesh_mappings_reverse = {} for row in read_csv(MESH_MAPPINGS_PATH, delimiter='\t'): @@ -715,7 +715,7 @@ def get_all_terms(): def main(): - from .resources import GROUNDING_TERMS_PATH as fname + from gilda.resources import GROUNDING_TERMS_PATH as fname terms = get_all_terms() dump_terms(terms, fname) From d6bf9350510b42ab7f74d1de34d559a70425d575 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 23 Jul 2024 22:43:48 -0400 Subject: [PATCH 2/6] Handle punctuation during annotation --- gilda/ner.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/gilda/ner.py b/gilda/ner.py index 0b7881c..843f58e 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -47,8 +47,10 @@ from typing import List +import nltk from nltk.corpus import stopwords from nltk.tokenize import sent_tokenize +from nltk.tokenize import TreebankWordTokenizer from gilda import get_grounder from gilda.grounder import Annotation @@ -109,11 +111,11 @@ def annotate( sentences = sent_split_fun(text) text_coord = 0 annotations = [] + word_tokenizer = TreebankWordTokenizer() for sentence in sentences: - raw_words = [w for w in sentence.rstrip('.').split()] - word_coords = [text_coord] - for word in raw_words: - word_coords.append(word_coords[-1] + len(word) + 1) + raw_word_coords = \ + list(word_tokenizer.span_tokenize(sentence.rstrip('.'))) + raw_words = [sentence[start:end] for start, end in raw_word_coords] text_coord += len(sentence) + 1 words = [normalize(w) for w in raw_words] skip_until = 0 @@ -132,17 +134,26 @@ def annotate( # Find the largest matching span for span in sorted(applicable_spans, reverse=True): - txt_span = ' '.join(raw_words[idx:idx+span]) + # We have to reconstruct a text span while adding spaces + # where needed + txt_span = '' + raw_span = '' + for w, rw, c in zip(words[idx:idx+span], + raw_words[idx:idx+span], + raw_word_coords[idx:idx+span]): + # Figure out if we need a space before this word, then + # append the word. + spaces = ' ' * (c[0] - len(raw_span)) + txt_span += spaces + w + raw_span += spaces + rw context = text if context_text is None else context_text matches = grounder.ground(txt_span, context=context, organisms=organisms, namespaces=namespaces) if matches: - start_coord = word_coords[idx] - end_coord = word_coords[idx+span-1] + \ - len(raw_words[idx+span-1]) - raw_span = ' '.join(raw_words[idx:idx+span]) + start_coord = raw_word_coords[idx][0] + end_coord = raw_word_coords[idx+span-1][1] annotations.append(Annotation( raw_span, matches, start_coord, end_coord )) From abed6195a6d9ef99eb2204a4865ec7e8947767b9 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 23 Jul 2024 23:20:23 -0400 Subject: [PATCH 3/6] Use raw word offset in choosing spaces --- gilda/ner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gilda/ner.py b/gilda/ner.py index 843f58e..bc0a25b 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -143,7 +143,7 @@ def annotate( raw_word_coords[idx:idx+span]): # Figure out if we need a space before this word, then # append the word. - spaces = ' ' * (c[0] - len(raw_span)) + spaces = ' ' * (c[0] - len(raw_span) - raw_word_coords[idx][0]) txt_span += spaces + w raw_span += spaces + rw context = text if context_text is None else context_text From c4719354d9d923f6bccec4de4ed2f29da0ed3358 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 23 Jul 2024 23:20:36 -0400 Subject: [PATCH 4/6] Add tests for NER punctuation --- gilda/tests/test_ner.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py index 9958fae..6c54758 100644 --- a/gilda/tests/test_ner.py +++ b/gilda/tests/test_ner.py @@ -86,3 +86,28 @@ def test_context_test(): assert results[0].matches[0].term.get_curie() == "GO:0005783" assert results[0].text == "ER" assert (results[0].start, results[0].end) == (14, 16) + + +def test_punctuation_comma_in_entity(): + # A named entity with an actual comma in its name + res = gilda.annotate('access, internet') + assert len(res) == 1 + # Make sure we capture the text span exactly despite + # tokenization + assert res[0].text == 'access, internet' + assert res[0].start == 0 + assert res[0].end == 16 + assert res[0].matches[0].term.db == 'MESH' + assert res[0].matches[0].term.id == 'D000077230' + + +def test_punctuation_outside_entities(): + res = gilda.annotate('EGF binds EGFR, which is a receptor.') + assert len(res) == 3 + + assert [ann.text for ann in res] == ['EGF', 'EGFR', 'receptor'] + + res = gilda.annotate('EGF binds EGFR: a receptor.') + assert len(res) == 3 + + assert [ann.text for ann in res] == ['EGF', 'EGFR', 'receptor'] From 72fe19c7a7fb3735c825c77948a265a5225ae383 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Wed, 24 Jul 2024 00:11:50 -0400 Subject: [PATCH 5/6] Implement sentence offset tracking with spaces --- gilda/ner.py | 22 ++++++++++++++-------- gilda/tests/test_ner.py | 6 +++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/gilda/ner.py b/gilda/ner.py index bc0a25b..d904734 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -47,10 +47,8 @@ from typing import List -import nltk from nltk.corpus import stopwords -from nltk.tokenize import sent_tokenize -from nltk.tokenize import TreebankWordTokenizer +from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer from gilda import get_grounder from gilda.grounder import Annotation @@ -105,14 +103,20 @@ def annotate( """ if grounder is None: grounder = get_grounder() + sent_tokenizer = PunktSentenceTokenizer() if sent_split_fun is None: - sent_split_fun = sent_tokenize + sent_split_fun = sent_tokenizer.tokenize # Get sentences sentences = sent_split_fun(text) + sentence_coords = list(sent_tokenizer.span_tokenize(text)) text_coord = 0 annotations = [] word_tokenizer = TreebankWordTokenizer() - for sentence in sentences: + # FIXME: a custom sentence split function can be inconsistent + # with the coordinates being used here which come from NLTK + for sentence, sentence_coord in zip(sentences, sentence_coords): + # FIXME: one rare corner case is named entities with single quotes + # in them which get tokenized in a weird way raw_word_coords = \ list(word_tokenizer.span_tokenize(sentence.rstrip('.'))) raw_words = [sentence[start:end] for start, end in raw_word_coords] @@ -143,7 +147,8 @@ def annotate( raw_word_coords[idx:idx+span]): # Figure out if we need a space before this word, then # append the word. - spaces = ' ' * (c[0] - len(raw_span) - raw_word_coords[idx][0]) + spaces = ' ' * (c[0] - len(raw_span) - + raw_word_coords[idx][0]) txt_span += spaces + w raw_span += spaces + rw context = text if context_text is None else context_text @@ -152,8 +157,9 @@ def annotate( organisms=organisms, namespaces=namespaces) if matches: - start_coord = raw_word_coords[idx][0] - end_coord = raw_word_coords[idx+span-1][1] + start_coord = sentence_coord[0] + raw_word_coords[idx][0] + end_coord = sentence_coord[0] + \ + raw_word_coords[idx+span-1][1] annotations.append(Annotation( raw_span, matches, start_coord, end_coord )) diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py index 6c54758..c8b24c1 100644 --- a/gilda/tests/test_ner.py +++ b/gilda/tests/test_ner.py @@ -83,9 +83,9 @@ def test_context_test(): context_text = "Calcium is released from the ER." results = gilda.annotate(text, context_text=context_text) assert len(results) == 1 - assert results[0].matches[0].term.get_curie() == "GO:0005783" - assert results[0].text == "ER" - assert (results[0].start, results[0].end) == (14, 16) + assert results[1].matches[0].term.get_curie() == "GO:0005783" + assert results[1].text == "ER" + assert (results[1].start, results[0].end) == (14, 16) def test_punctuation_comma_in_entity(): From cfc0dcd90038f4393b70d1a12eeaaaa512324b04 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Wed, 24 Jul 2024 00:31:10 -0400 Subject: [PATCH 6/6] Fix the role of raw vs processed words --- gilda/ner.py | 7 ++----- gilda/tests/test_ner.py | 6 +++--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/gilda/ner.py b/gilda/ner.py index d904734..6feb574 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -140,19 +140,16 @@ def annotate( for span in sorted(applicable_spans, reverse=True): # We have to reconstruct a text span while adding spaces # where needed - txt_span = '' raw_span = '' - for w, rw, c in zip(words[idx:idx+span], - raw_words[idx:idx+span], + for rw, c in zip(raw_words[idx:idx+span], raw_word_coords[idx:idx+span]): # Figure out if we need a space before this word, then # append the word. spaces = ' ' * (c[0] - len(raw_span) - raw_word_coords[idx][0]) - txt_span += spaces + w raw_span += spaces + rw context = text if context_text is None else context_text - matches = grounder.ground(txt_span, + matches = grounder.ground(raw_span, context=context, organisms=organisms, namespaces=namespaces) diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py index c8b24c1..6c54758 100644 --- a/gilda/tests/test_ner.py +++ b/gilda/tests/test_ner.py @@ -83,9 +83,9 @@ def test_context_test(): context_text = "Calcium is released from the ER." results = gilda.annotate(text, context_text=context_text) assert len(results) == 1 - assert results[1].matches[0].term.get_curie() == "GO:0005783" - assert results[1].text == "ER" - assert (results[1].start, results[0].end) == (14, 16) + assert results[0].matches[0].term.get_curie() == "GO:0005783" + assert results[0].text == "ER" + assert (results[0].start, results[0].end) == (14, 16) def test_punctuation_comma_in_entity():