From b8316899f4b9340683cf7c98c5338ddd7dc09821 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Tue, 23 Jul 2024 22:43:36 -0400
Subject: [PATCH 1/6] Use absolute imports

---
 gilda/generate_terms.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gilda/generate_terms.py b/gilda/generate_terms.py
index 9e5fcf6..632c50a 100644
--- a/gilda/generate_terms.py
+++ b/gilda/generate_terms.py
@@ -12,9 +12,9 @@
 from indra.databases import hgnc_client, uniprot_client, chebi_client, \
     go_client, mesh_client, doid_client
 from indra.statements.resources import amino_acids
-from .term import Term, dump_terms, filter_out_duplicates
-from .process import normalize
-from .resources import resource_dir, popular_organisms
+from gilda.term import Term, dump_terms, filter_out_duplicates
+from gilda.process import normalize
+from gilda.resources import resource_dir, popular_organisms
 
 
 indra_module_path = indra.__path__[0]
@@ -666,7 +666,7 @@ def _generate_obo_terms(prefix, ignore_mappings=False, map_to_ns=None):
 
 def _make_mesh_mappings():
     # Load MeSH ID/label mappings
-    from .resources import MESH_MAPPINGS_PATH
+    from gilda.resources import MESH_MAPPINGS_PATH
     mesh_mappings = {}
     mesh_mappings_reverse = {}
     for row in read_csv(MESH_MAPPINGS_PATH, delimiter='\t'):
@@ -715,7 +715,7 @@ def get_all_terms():
 
 
 def main():
-    from .resources import GROUNDING_TERMS_PATH as fname
+    from gilda.resources import GROUNDING_TERMS_PATH as fname
     terms = get_all_terms()
     dump_terms(terms, fname)
 

From d6bf9350510b42ab7f74d1de34d559a70425d575 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Tue, 23 Jul 2024 22:43:48 -0400
Subject: [PATCH 2/6] Handle punctuation during annotation

---
 gilda/ner.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index 0b7881c..843f58e 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -47,8 +47,10 @@
 
 from typing import List
 
+import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import sent_tokenize
+from nltk.tokenize import TreebankWordTokenizer
 
 from gilda import get_grounder
 from gilda.grounder import Annotation
@@ -109,11 +111,11 @@ def annotate(
     sentences = sent_split_fun(text)
     text_coord = 0
     annotations = []
+    word_tokenizer = TreebankWordTokenizer()
     for sentence in sentences:
-        raw_words = [w for w in sentence.rstrip('.').split()]
-        word_coords = [text_coord]
-        for word in raw_words:
-            word_coords.append(word_coords[-1] + len(word) + 1)
+        raw_word_coords = \
+            list(word_tokenizer.span_tokenize(sentence.rstrip('.')))
+        raw_words = [sentence[start:end] for start, end in raw_word_coords]
         text_coord += len(sentence) + 1
         words = [normalize(w) for w in raw_words]
         skip_until = 0
@@ -132,17 +134,26 @@ def annotate(
 
             # Find the largest matching span
             for span in sorted(applicable_spans, reverse=True):
-                txt_span = ' '.join(raw_words[idx:idx+span])
+                # We have to reconstruct a text span while adding spaces
+                # where needed
+                txt_span = ''
+                raw_span = ''
+                for w, rw, c in zip(words[idx:idx+span],
+                                    raw_words[idx:idx+span],
+                                    raw_word_coords[idx:idx+span]):
+                    # Figure out if we need a space before this word, then
+                    # append the word.
+                    spaces = ' ' * (c[0] - len(raw_span))
+                    txt_span += spaces + w
+                    raw_span += spaces + rw
                 context = text if context_text is None else context_text
                 matches = grounder.ground(txt_span,
                                           context=context,
                                           organisms=organisms,
                                           namespaces=namespaces)
                 if matches:
-                    start_coord = word_coords[idx]
-                    end_coord = word_coords[idx+span-1] + \
-                        len(raw_words[idx+span-1])
-                    raw_span = ' '.join(raw_words[idx:idx+span])
+                    start_coord = raw_word_coords[idx][0]
+                    end_coord = raw_word_coords[idx+span-1][1]
                     annotations.append(Annotation(
                         raw_span, matches, start_coord, end_coord
                     ))

From abed6195a6d9ef99eb2204a4865ec7e8947767b9 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Tue, 23 Jul 2024 23:20:23 -0400
Subject: [PATCH 3/6] Use raw word offset in choosing spaces

---
 gilda/ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index 843f58e..bc0a25b 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -143,7 +143,7 @@ def annotate(
                                     raw_word_coords[idx:idx+span]):
                     # Figure out if we need a space before this word, then
                     # append the word.
-                    spaces = ' ' * (c[0] - len(raw_span))
+                    spaces = ' ' * (c[0] - len(raw_span) - raw_word_coords[idx][0])
                     txt_span += spaces + w
                     raw_span += spaces + rw
                 context = text if context_text is None else context_text

From c4719354d9d923f6bccec4de4ed2f29da0ed3358 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Tue, 23 Jul 2024 23:20:36 -0400
Subject: [PATCH 4/6] Add tests for NER punctuation

---
 gilda/tests/test_ner.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py
index 9958fae..6c54758 100644
--- a/gilda/tests/test_ner.py
+++ b/gilda/tests/test_ner.py
@@ -86,3 +86,28 @@ def test_context_test():
     assert results[0].matches[0].term.get_curie() == "GO:0005783"
     assert results[0].text == "ER"
     assert (results[0].start, results[0].end) == (14, 16)
+
+
+def test_punctuation_comma_in_entity():
+    # A named entity with an actual comma in its name
+    res = gilda.annotate('access, internet')
+    assert len(res) == 1
+    # Make sure we capture the text span exactly despite
+    # tokenization
+    assert res[0].text == 'access, internet'
+    assert res[0].start == 0
+    assert res[0].end == 16
+    assert res[0].matches[0].term.db == 'MESH'
+    assert res[0].matches[0].term.id == 'D000077230'
+
+
+def test_punctuation_outside_entities():
+    res = gilda.annotate('EGF binds EGFR, which is a receptor.')
+    assert len(res) == 3
+
+    assert [ann.text for ann in res] == ['EGF', 'EGFR', 'receptor']
+
+    res = gilda.annotate('EGF binds EGFR: a receptor.')
+    assert len(res) == 3
+
+    assert [ann.text for ann in res] == ['EGF', 'EGFR', 'receptor']

From 72fe19c7a7fb3735c825c77948a265a5225ae383 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Wed, 24 Jul 2024 00:11:50 -0400
Subject: [PATCH 5/6] Implement sentence offset tracking with spaces

---
 gilda/ner.py            | 22 ++++++++++++++--------
 gilda/tests/test_ner.py |  6 +++---
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index bc0a25b..d904734 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -47,10 +47,8 @@
 
 from typing import List
 
-import nltk
 from nltk.corpus import stopwords
-from nltk.tokenize import sent_tokenize
-from nltk.tokenize import TreebankWordTokenizer
+from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer
 
 from gilda import get_grounder
 from gilda.grounder import Annotation
@@ -105,14 +103,20 @@ def annotate(
     """
     if grounder is None:
         grounder = get_grounder()
+    sent_tokenizer = PunktSentenceTokenizer()
     if sent_split_fun is None:
-        sent_split_fun = sent_tokenize
+        sent_split_fun = sent_tokenizer.tokenize
     # Get sentences
     sentences = sent_split_fun(text)
+    sentence_coords = list(sent_tokenizer.span_tokenize(text))
     text_coord = 0
     annotations = []
     word_tokenizer = TreebankWordTokenizer()
-    for sentence in sentences:
+    # FIXME: sentence_coords always come from NLTK's Punkt tokenizer, so a
+    # custom sent_split_fun may yield sentences that do not match them
+    for sentence, sentence_coord in zip(sentences, sentence_coords):
+        # FIXME: a rare corner case is named entities containing single
+        # quotes, which the word tokenizer splits in unexpected ways
         raw_word_coords = \
             list(word_tokenizer.span_tokenize(sentence.rstrip('.')))
         raw_words = [sentence[start:end] for start, end in raw_word_coords]
@@ -143,7 +147,8 @@ def annotate(
                                     raw_word_coords[idx:idx+span]):
                     # Figure out if we need a space before this word, then
                     # append the word.
-                    spaces = ' ' * (c[0] - len(raw_span) - raw_word_coords[idx][0])
+                    spaces = ' ' * (c[0] - len(raw_span) -
+                                    raw_word_coords[idx][0])
                     txt_span += spaces + w
                     raw_span += spaces + rw
                 context = text if context_text is None else context_text
@@ -152,8 +157,9 @@ def annotate(
                                           organisms=organisms,
                                           namespaces=namespaces)
                 if matches:
-                    start_coord = raw_word_coords[idx][0]
-                    end_coord = raw_word_coords[idx+span-1][1]
+                    start_coord = sentence_coord[0] + raw_word_coords[idx][0]
+                    end_coord = sentence_coord[0] + \
+                        raw_word_coords[idx+span-1][1]
                     annotations.append(Annotation(
                         raw_span, matches, start_coord, end_coord
                     ))
diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py
index 6c54758..c8b24c1 100644
--- a/gilda/tests/test_ner.py
+++ b/gilda/tests/test_ner.py
@@ -83,9 +83,9 @@ def test_context_test():
     context_text = "Calcium is released from the ER."
     results = gilda.annotate(text, context_text=context_text)
     assert len(results) == 1
-    assert results[0].matches[0].term.get_curie() == "GO:0005783"
-    assert results[0].text == "ER"
-    assert (results[0].start, results[0].end) == (14, 16)
+    assert results[1].matches[0].term.get_curie() == "GO:0005783"
+    assert results[1].text == "ER"
+    assert (results[1].start, results[0].end) == (14, 16)
 
 
 def test_punctuation_comma_in_entity():

From cfc0dcd90038f4393b70d1a12eeaaaa512324b04 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Wed, 24 Jul 2024 00:31:10 -0400
Subject: [PATCH 6/6] Fix the role of raw vs processed words

---
 gilda/ner.py            | 7 ++-----
 gilda/tests/test_ner.py | 6 +++---
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/gilda/ner.py b/gilda/ner.py
index d904734..6feb574 100644
--- a/gilda/ner.py
+++ b/gilda/ner.py
@@ -140,19 +140,16 @@ def annotate(
             for span in sorted(applicable_spans, reverse=True):
                 # We have to reconstruct a text span while adding spaces
                 # where needed
-                txt_span = ''
                 raw_span = ''
-                for w, rw, c in zip(words[idx:idx+span],
-                                    raw_words[idx:idx+span],
+                for rw, c in zip(raw_words[idx:idx+span],
                                     raw_word_coords[idx:idx+span]):
                     # Figure out if we need a space before this word, then
                     # append the word.
                     spaces = ' ' * (c[0] - len(raw_span) -
                                     raw_word_coords[idx][0])
-                    txt_span += spaces + w
                     raw_span += spaces + rw
                 context = text if context_text is None else context_text
-                matches = grounder.ground(txt_span,
+                matches = grounder.ground(raw_span,
                                           context=context,
                                           organisms=organisms,
                                           namespaces=namespaces)
diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py
index c8b24c1..6c54758 100644
--- a/gilda/tests/test_ner.py
+++ b/gilda/tests/test_ner.py
@@ -83,9 +83,9 @@ def test_context_test():
     context_text = "Calcium is released from the ER."
     results = gilda.annotate(text, context_text=context_text)
     assert len(results) == 1
-    assert results[1].matches[0].term.get_curie() == "GO:0005783"
-    assert results[1].text == "ER"
-    assert (results[1].start, results[0].end) == (14, 16)
+    assert results[0].matches[0].term.get_curie() == "GO:0005783"
+    assert results[0].text == "ER"
+    assert (results[0].start, results[0].end) == (14, 16)
 
 
 def test_punctuation_comma_in_entity():