Merge pull request #143 from gyorilab/annotate_matches
Make annotation return a list of Annotation objects, each with a list of ScoredMatches
bgyori authored Jul 17, 2024
2 parents 52c8b79 + c630da5 commit 306e6c7
Showing 6 changed files with 110 additions and 90 deletions.
6 changes: 4 additions & 2 deletions gilda/__init__.py
@@ -2,8 +2,9 @@

import logging

from .api import get_grounder, get_models, get_names, ground, make_grounder, annotate
from .grounder import Grounder, ScoredMatch
from .api import get_grounder, get_models, get_names, ground, make_grounder, \
annotate
from .grounder import Grounder, ScoredMatch, Annotation
from .pandas_utils import ground_df, ground_df_map
from .term import Term, dump_terms

@@ -19,6 +20,7 @@
'Term',
'Grounder',
'ScoredMatch',
'Annotation',
# Meta
'__version__',
# Pandas utilities
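With this change, Annotation is exported from the top-level package alongside Grounder and ScoredMatch. A minimal sketch of the new import path, assuming the package is installed at this commit; the example text is reused from the gilda.ner doctest further down:

    from gilda import Annotation, annotate

    # annotate() now returns Annotation objects rather than 4-tuples
    annotations = annotate("MEK phosphorylates ERK")
    assert all(isinstance(ann, Annotation) for ann in annotations)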
21 changes: 9 additions & 12 deletions gilda/api.py
@@ -9,7 +9,7 @@

from typing import List, Mapping, Union, Optional

from gilda.grounder import Grounder
from gilda.grounder import Grounder, Annotation
from gilda.term import Term


@@ -112,9 +112,8 @@ def annotate(
sent_split_fun=None,
organisms=None,
namespaces=None,
return_first: bool = True,
context_text: str = None,
):
) -> List[Annotation]:
"""Annotate a given text with Gilda (i.e., do named entity recognition).
Parameters
@@ -132,28 +131,26 @@
namespaces : list[str], optional
A list of namespaces to pass to the grounder to restrict the matches
to. By default, no restriction is applied.
return_first :
If true, only returns the first result. Otherwise, returns all results.
context_text :
A longer span of text that serves as additional context for the text
being annotated for disambiguation purposes.
Returns
-------
list[tuple[str, ScoredMatch, int, int]]
A list of tuples of start and end character offsets of the text
corresponding to the entity, the entity text, and the ScoredMatch
object corresponding to the entity.
list[Annotation]
A list of Annotation objects, each of which contains as attributes
the text span that was matched, the list of ScoredMatches, and the
start and end character offsets of the text span.
"""
from .ner import annotate as _annotate
import gilda.ner

return _annotate(
return gilda.ner.annotate(
text,
grounder=grounder,
sent_split_fun=sent_split_fun,
organisms=organisms,
namespaces=namespaces,
return_first=return_first,
context_text=context_text,
)

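A usage sketch of the updated gilda.annotate API described in the docstring above. The example text comes from the gilda.ner doctest; the score attribute on ScoredMatch is an assumption inferred from the Score column in the NER results template, not shown in this diff:

    import gilda

    annotations = gilda.annotate("MEK phosphorylates ERK")
    for ann in annotations:
        best = ann.matches[0]  # matches are sorted, best first
        print(ann.text, ann.start, ann.end, best.term.get_curie(), best.score)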
11 changes: 7 additions & 4 deletions gilda/app/templates/ner_matches.html
@@ -18,17 +18,20 @@ <h3 class="panel-title">NER Results</h3>
<thead>
<tr>
<th>Span</th>
<th>Text</th>
<th>Grounding</th>
<th>Name</th>
<th>Standard name</th>
<th>Score</th>
<th>Additional Groundings</th>
<th>Additional groundings</th>
</tr>
</thead>
<tbody>
{% for text, match, start, end in annotations %}
{% for annotation in annotations %}
<tr>
{% set match = annotation['matches'][0] %}
{% set match_curie = match.term.get_curie() %}
<td>{{start}}-{{end}}</td>
<td>{{ annotation['start'] }}-{{ annotation['end'] }}</td>
<td>{{ annotation['text'] }}</td>
<td>
<a class="label label-primary" href="{{ match['url'] }}">
{{ match_curie }}
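For context, a minimal Flask-style sketch of a view feeding this template. Only the template filename and the annotations variable come from the diff; the route, form field, and app wiring are assumptions for illustration (the actual gilda.app setup is not shown here). Note that Jinja2 resolves annotation['start'] against either dict keys or object attributes, so the loop works whether the view serializes the Annotation objects or passes them through directly.

    from flask import Flask, render_template, request

    import gilda

    app = Flask(__name__)

    @app.route("/ner", methods=["POST"])
    def ner_view():
        # Hypothetical view: annotate the submitted text and render the table above
        annotations = gilda.annotate(request.form["text"])
        return render_template("ner_matches.html", annotations=annotations)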
32 changes: 32 additions & 0 deletions gilda/grounder.py
@@ -26,6 +26,7 @@
"Grounder",
"GrounderInput",
"ScoredMatch",
"Annotation",
"load_terms_file",
"load_entries_from_terms_file",
"filter_for_organism",
@@ -652,6 +653,37 @@ def get_grounding_dict(self) -> Mapping[str, str]:
}


class Annotation:
"""A class representing a named entity annotation in a given text.
Attributes
----------
text : str
The text span that was annotated.
matches : list[ScoredMatch]
The list of scored matches for the text span.
start : int
The start character offset of the text span.
end : int
The end character offset of the text span.
"""
def __init__(self, text: str, matches: List[ScoredMatch], start: int,
end: int):
self.text = text
self.matches = matches
self.start = start
self.end = end

def __repr__(self):
return str(self)

def __str__(self):
return (f"Annotation({self.text}, {self.matches}, {self.start}, "
f"{self.end})")




def load_entries_from_terms_file(terms_file: Union[str, Path]) -> Iterator[Term]:
"""Yield Terms from a compressed terms TSV file path.
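A minimal sketch of constructing the new Annotation class directly from a grounding call, to show how it ties a text span to its ScoredMatch list; the span offsets are taken from the gilda.ner doctest rather than computed here:

    from gilda import get_grounder
    from gilda.grounder import Annotation

    grounder = get_grounder()
    matches = grounder.ground("ERK")  # sorted list of ScoredMatch objects
    if matches:
        ann = Annotation("ERK", matches, start=19, end=22)
        print(ann)  # Annotation(ERK, [ScoredMatch(...)], 19, 22)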
80 changes: 36 additions & 44 deletions gilda/ner.py
@@ -6,24 +6,25 @@
>>> text = "MEK phosphorylates ERK"
>>> results = annotate(text)
The results are a list of 4-tuples containing:
The results are a list of Annotation objects, each of which contains:
- the text string matched
- a :class:`gilda.grounder.ScoredMatch` instance containing the _best_ match
- the position in the text string where the entity starts
- the position in the text string where the entity ends
- the `text` string matched
- a sorted list of :class:`gilda.grounder.ScoredMatch` instances
for the given text span (the first one is the best match)
- the `start` position in the text string where the entity starts
- the `end` position in the text string where the entity ends
In this example, the two concepts are grounded to FamPlex entries.
>>> results[0][0], results[0][1].term.get_curie(), results[0][2], results[0][3]
>>> results[0].text, results[0].matches[0].term.get_curie(), results[0].start, results[0].end
('MEK', 'fplx:MEK', 0, 3)
>>> results[1][0], results[1][1].term.get_curie(), results[1][2], results[1][3]
>>> results[1].text, results[1].matches[0].term.get_curie(), results[1].start, results[1].end
('ERK', 'fplx:ERK', 19, 22)
If you look directly at the best match of an annotation, you get a full
description of the match itself:
>>> results[0][1]
>>> results[0].matches[0]
ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\
0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\
False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')]))
@@ -44,32 +45,30 @@
same name but extension ``.ann``.
"""

from typing import List, Tuple
from typing import List

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

from gilda import ScoredMatch, get_grounder
from gilda import get_grounder
from gilda.grounder import Annotation
from gilda.process import normalize

__all__ = [
"annotate",
"get_brat",
"Annotation",
"stop_words"
]

stop_words = set(stopwords.words('english'))

Annotation = Tuple[str, ScoredMatch, int, int]


def annotate(
text, *,
grounder=None,
sent_split_fun=None,
organisms=None,
namespaces=None,
return_first: bool = True,
context_text: str = None,
) -> List[Annotation]:
"""Annotate a given text with Gilda.
@@ -85,24 +84,22 @@ def annotate(
:func:`nltk.tokenize.sent_tokenize`. The function should take a string
as input and return an iterable of strings corresponding to the sentences
in the input text.
organisms : list[str], optional
organisms : List[str], optional
A list of organism names to pass to the grounder. If not provided,
human is used.
namespaces : list[str], optional
namespaces : List[str], optional
A list of namespaces to pass to the grounder to restrict the matches
to. By default, no restriction is applied.
return_first :
If true, only returns the first result. Otherwise, returns all results.
context_text :
A longer span of text that serves as additional context for the text
being annotated for disambiguation purposes.
Returns
-------
list[tuple[str, ScoredMatch, int, int]]
A list of tuples of start and end character offsets of the text
corresponding to the entity, the entity text, and the ScoredMatch
object corresponding to the entity.
List[Annotation]
A list of Annotations where each contains as attributes
the text span that was matched, the list of ScoredMatches, and the
start and end character offsets of the text span.
"""
if grounder is None:
grounder = get_grounder()
@@ -111,7 +108,7 @@
# Get sentences
sentences = sent_split_fun(text)
text_coord = 0
entities = []
annotations = []
for sentence in sentences:
raw_words = [w for w in sentence.rstrip('.').split()]
word_coords = [text_coord]
@@ -136,36 +133,32 @@
# Find the largest matching span
for span in sorted(applicable_spans, reverse=True):
txt_span = ' '.join(raw_words[idx:idx+span])
matches = grounder.ground(
txt_span, context=text if context_text is None else context_text,
organisms=organisms, namespaces=namespaces,
)
context = text if context_text is None else context_text
matches = grounder.ground(txt_span,
context=context,
organisms=organisms,
namespaces=namespaces)
if matches:
start_coord = word_coords[idx]
end_coord = word_coords[idx+span-1] + \
len(raw_words[idx+span-1])
raw_span = ' '.join(raw_words[idx:idx+span])

if return_first:
matches = [matches[0]]
for match in matches:
entities.append(
(raw_span, match, start_coord, end_coord)
)
annotations.append(Annotation(
raw_span, matches, start_coord, end_coord
))

skip_until = idx + span
break
return entities
return annotations


def get_brat(entities, entity_type="Entity", ix_offset=1, include_text=True):
def get_brat(annotations, entity_type="Entity", ix_offset=1, include_text=True):
"""Return brat-formatted annotation strings for the given entities.
Parameters
----------
entities : list[tuple[str, str | ScoredMatch, int, int]]
A list of tuples of entity text, grounded curie, start and end
character offsets in the text corresponding to an entity.
annotations : list[Annotation]
A list of named entity annotations in the text.
entity_type : str, optional
The brat entity type to use for the annotations. The default is
'Entity'. This is useful for differentiating between annotations in
@@ -184,13 +177,12 @@ def get_brat(entities, entity_type="Entity", ix_offset=1, include_text=True):
"""
brat = []
ix_offset = max(1, ix_offset)
for idx, (raw_span, curie, start, end) in enumerate(entities, ix_offset):
if isinstance(curie, ScoredMatch):
curie = curie.term.get_curie()
for idx, annotation in enumerate(annotations, ix_offset):
curie = annotation.matches[0].term.get_curie()
if entity_type != "Entity":
curie += f"; Reading system: {entity_type}"
row = f'T{idx}\t{entity_type} {start} {end}' + (
f'\t{raw_span}' if include_text else ''
row = f'T{idx}\t{entity_type} {annotation.start} {annotation.end}' + (
f'\t{annotation.text}' if include_text else ''
)
brat.append(row)
row = f'#{idx}\tAnnotatorNotes T{idx}\t{curie}'
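A hedged end-to-end sketch of the BRAT export path mentioned in the module docstring: annotate a text file and write the standoff annotations to a sibling .ann file. The file names are placeholders, and this assumes get_brat returns the joined annotation string (its return statement falls outside the visible hunk):

    from pathlib import Path

    import gilda
    from gilda.ner import get_brat

    text = Path("example.txt").read_text()  # placeholder input document
    annotations = gilda.annotate(text)
    Path("example.ann").write_text(get_brat(annotations))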
(diff for the remaining changed file not shown)
