Skip to content

Commit

Permalink
Fix Dutch noun chunks to skip overlapping spans (#11275)
Browse files Browse the repository at this point in the history
* Add test for overlapping noun chunks

* Skip overlapping noun chunks

* Update spacy/tests/lang/nl/test_noun_chunks.py

Co-authored-by: Sofie Van Landeghem <[email protected]>

Co-authored-by: Sofie Van Landeghem <[email protected]>
  • Loading branch information
adrianeboyd and svlandeg committed Aug 10, 2022
1 parent 231a178 commit ed4ad30
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
11 changes: 7 additions & 4 deletions spacy/lang/nl/syntax_iterators.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
span_label = doc.vocab.strings.add("NP")

# Only NOUNS and PRONOUNS matter
+ end_span = -1
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
# For NOUNS
# Pick children from syntactic parse (only those with certain dependencies)
Expand All @@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
children_i = [c.i for c in children] + [word.i]

start_span = min(children_i)
- end_span = max(children_i) + 1
- yield start_span, end_span, span_label
+ if start_span >= end_span:
+     end_span = max(children_i) + 1
+     yield start_span, end_span, span_label

# PRONOUNS only if it is the subject of a verb
elif word.pos == PRON:
if word.dep in pronoun_deps:
start_span = word.i
- end_span = word.i + 1
- yield start_span, end_span, span_label
+ if start_span >= end_span:
+     end_span = word.i + 1
+     yield start_span, end_span, span_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
18 changes: 17 additions & 1 deletion spacy/tests/lang/nl/test_noun_chunks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
- from spacy.tokens import Doc
  import pytest
+ from spacy.tokens import Doc
+ from spacy.util import filter_spans


@pytest.fixture
Expand Down Expand Up @@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking):
"""
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
assert chunks == nl_reference_chunking


@pytest.mark.issue(10846)
def test_no_overlapping_chunks(nl_vocab):
    """Regression test: Dutch noun chunks must not overlap one another.

    Builds a hand-annotated Doc for the sentence "Dit programma wordt
    beschouwd als 's werelds eerste computerprogramma" and checks that the
    noun chunks it produces are unchanged by ``filter_spans``.
    """
    # fmt: off
    tokens = ["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"]
    dep_labels = ["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"]
    head_indices = [1, 3, 3, 3, 8, 8, 5, 8, 3]
    pos_tags = ["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"]
    # fmt: on
    parsed = Doc(
        nl_vocab,
        words=tokens,
        deps=dep_labels,
        heads=head_indices,
        pos=pos_tags,
    )
    noun_spans = list(parsed.noun_chunks)
    # If filtering returns the same list, there was nothing overlapping to
    # drop (presumably filter_spans removes overlapping spans; confirm
    # against the spaCy API docs).
    assert filter_spans(noun_spans) == noun_spans

0 comments on commit ed4ad30

Please sign in to comment.