Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NER: Ensure zero-cost sequence with sentence split in entity #12465

Draft
wants to merge 3 commits into
base: v5
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions spacy/pipeline/_parser_internals/ner.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ cdef bint _entity_is_sunk(const StateC* state, Transition* golds) nogil:
return False


cdef bint _next_is_sent_start(const StateC* state) nogil:
    # Report whether the token at buffer offset 1 is flagged as a sentence
    # start (sent_start == 1).
    if state.B(1) == -1:
        # B(1) returned the -1 sentinel (presumably: no token at that buffer
        # position -- confirm against StateC), so B_(1) is never inspected.
        return False
    return state.B_(1).sent_start == 1


cdef class BiluoPushDown(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
Expand Down Expand Up @@ -388,7 +392,7 @@ cdef class Begin:
elif st.B_(1).ent_iob == 3:
# If the next word is B, we can't B now
return False
elif st.B_(1).sent_start == 1:
elif _next_is_sent_start(st):
# Don't allow entities to extend across sentence boundaries
return False
# Don't allow entities to start on whitespace
Expand Down Expand Up @@ -466,7 +470,7 @@ cdef class In:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
elif _next_is_sent_start(st):
# Don't allow entities to extend across sentence boundaries
return False
else:
Expand Down Expand Up @@ -558,8 +562,9 @@ cdef class Last:
# L, Gold B --> True
pass
elif g_act == IN:
# L, Gold I --> True iff this entity sunk
cost += not _entity_is_sunk(s, gold.ner)
# L, Gold I --> True iff this entity is sunk or there is a sentence
# break after the next buffer token.
cost += not (_entity_is_sunk(s, gold.ner) or _next_is_sent_start(s))
elif g_act == LAST:
# L, Gold L --> True
pass
Expand Down Expand Up @@ -674,8 +679,9 @@ cdef class Out:
if g_act == MISSING:
pass
elif g_act == BEGIN:
# O, Gold B --> False
cost += 1
# O, Gold B --> False, unless there is a sentence break after
# the next buffer token.
cost += not _next_is_sent_start(s)
elif g_act == IN:
# O, Gold I --> True
pass
Expand Down
25 changes: 25 additions & 0 deletions spacy/tests/parser/test_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,31 @@ def test_ner_warns_no_lookups(caplog):
assert "W033" not in caplog.text


def test_train_sent_split_in_entity():
    """Smoke-test NER training when sentence starts fall inside an entity.

    Mimics an annotating component that has split an entity across
    sentences: the predicted doc gets sentence-start flags on tokens that
    may lie inside the gold entity span. The test makes no assertions; it
    passes as long as ``nlp.update`` completes without raising.
    """
    nlp = English()
    ner = nlp.add_pipe("ner", config={"update_with_oracle_cut_size": 3})

    text = "I like the Kinesis Advantage2 LF very much."
    annotations = {"entities": [(11, 32, "MISC")]}
    base_example = Example.from_dict(nlp.make_doc(text), annotations)

    def with_sent_starts(first, second):
        # Work on a copy so the base example is left untouched.
        variant = base_example.copy()
        variant.predicted[first].is_sent_start = True
        variant.predicted[second].is_sent_start = True
        return variant

    # Go berserk: mark a boundary on every ordered pair of token positions
    # (index 0 excluded), including pairs where both positions coincide.
    positions = range(1, len(base_example.predicted))
    train_examples = [
        with_sent_starts(i, j) for i in positions for j in positions
    ]

    ner.add_label("MISC")
    nlp.initialize()
    nlp.update(train_examples, sgd=False, annotates=[])


@Language.factory("blocker")
class BlockerComponent1:
def __init__(self, nlp, start, end, name="my_blocker"):
Expand Down
Loading