Skip to content

Commit

Permalink
Add offsets_from_biluo_tags helper and tests (see explosion#1626)
Browse files Browse the repository at this point in the history
  • Loading branch information
ines committed Nov 26, 2017
1 parent e4ee666 commit c699aec
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 1 deletion.
19 changes: 19 additions & 0 deletions spacy/gold.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -541,5 +541,24 @@ def biluo_tags_from_offsets(doc, entities, missing='O'):
return biluo


def offsets_from_biluo_tags(doc, tags):
    """Convert per-token BILUO tags into character-offset entity annotations.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    """
    # Each item from tags_to_entities is unpacked as (label, first_token,
    # last_token). The last token index is treated as inclusive here, hence
    # the `+ 1` when slicing the doc to obtain the matching span.
    spans = [
        (label, doc[first_token : last_token + 1])
        for label, first_token, last_token in tags_to_entities(tags)
    ]
    return [(span.start_char, span.end_char, label) for label, span in spans]


def is_punct_label(label):
    """Check whether a label denotes punctuation.

    label (unicode): The label to check.
    RETURNS (bool): True if the label is exactly 'P', or equals 'punct'
        when compared case-insensitively.
    """
    # The short form is case-sensitive: only an upper-case 'P' counts.
    is_short_form = label == 'P'
    return is_short_form or label.lower() == 'punct'
13 changes: 12 additions & 1 deletion spacy/tests/gold/test_biluo.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals

from ...gold import biluo_tags_from_offsets
from ...gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from ...tokens.doc import Doc

import pytest
Expand Down Expand Up @@ -41,3 +41,14 @@ def test_gold_biluo_misalign(en_vocab):
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'O', '-', '-', '-']


def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    """Offsets -> BILUO tags -> offsets should reproduce both annotations."""
    text = "I flew to Silicon Valley via London."
    expected_offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')]
    expected_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
    doc = en_tokenizer(text)
    # Forward direction: character offsets to per-token tags.
    assert biluo_tags_from_offsets(doc, expected_offsets) == expected_tags
    # Reverse direction: per-token tags back to character offsets.
    assert offsets_from_biluo_tags(doc, expected_tags) == expected_offsets

0 comments on commit c699aec

Please sign in to comment.