Skip to content

Commit

Permalink
Implement sentence offset tracking with spaces
Browse files Browse the repository at this point in the history
  • Loading branch information
bgyori committed Jul 24, 2024
1 parent c471935 commit 72fe19c
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 11 deletions.
22 changes: 14 additions & 8 deletions gilda/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,8 @@

from typing import List

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

from gilda import get_grounder
from gilda.grounder import Annotation
Expand Down Expand Up @@ -105,14 +103,20 @@ def annotate(
"""
if grounder is None:
grounder = get_grounder()
sent_tokenizer = PunktSentenceTokenizer()
if sent_split_fun is None:
sent_split_fun = sent_tokenize
sent_split_fun = sent_tokenizer.tokenize
# Get sentences
sentences = sent_split_fun(text)
sentence_coords = list(sent_tokenizer.span_tokenize(text))
text_coord = 0
annotations = []
word_tokenizer = TreebankWordTokenizer()
for sentence in sentences:
# FIXME: a custom sentence split function can return sentences that are
# inconsistent with the sentence coordinates used here, which always come
# from NLTK's PunktSentenceTokenizer regardless of sent_split_fun
for sentence, sentence_coord in zip(sentences, sentence_coords):
# FIXME: one rare corner case is named entities containing single quotes,
# which the word tokenizer splits in an unexpected way
raw_word_coords = \
list(word_tokenizer.span_tokenize(sentence.rstrip('.')))
raw_words = [sentence[start:end] for start, end in raw_word_coords]
Expand Down Expand Up @@ -143,7 +147,8 @@ def annotate(
raw_word_coords[idx:idx+span]):
# Figure out if we need a space before this word, then
# append the word.
spaces = ' ' * (c[0] - len(raw_span) - raw_word_coords[idx][0])
spaces = ' ' * (c[0] - len(raw_span) -
raw_word_coords[idx][0])
txt_span += spaces + w
raw_span += spaces + rw
context = text if context_text is None else context_text
Expand All @@ -152,8 +157,9 @@ def annotate(
organisms=organisms,
namespaces=namespaces)
if matches:
start_coord = raw_word_coords[idx][0]
end_coord = raw_word_coords[idx+span-1][1]
start_coord = sentence_coord[0] + raw_word_coords[idx][0]
end_coord = sentence_coord[0] + \
raw_word_coords[idx+span-1][1]
annotations.append(Annotation(
raw_span, matches, start_coord, end_coord
))
Expand Down
6 changes: 3 additions & 3 deletions gilda/tests/test_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,9 @@ def test_context_test():
context_text = "Calcium is released from the ER."
results = gilda.annotate(text, context_text=context_text)
assert len(results) == 1
assert results[0].matches[0].term.get_curie() == "GO:0005783"
assert results[0].text == "ER"
assert (results[0].start, results[0].end) == (14, 16)
assert results[1].matches[0].term.get_curie() == "GO:0005783"
assert results[1].text == "ER"
assert (results[1].start, results[0].end) == (14, 16)


def test_punctuation_comma_in_entity():
Expand Down

0 comments on commit 72fe19c

Please sign in to comment.