Skip to content

Commit

Permalink
Update ner_spancat_compare for newer spaCy versions and fix doc/vocab issues (#191)
Browse files Browse the repository at this point in the history
* Rename the component factory to `transfer-ent` to avoid a `.` in the
  factory name (needed for spaCy v3.4.2+)
* In the convert script, make sure all spans belong to the same doc object
  when initializing span groups. Otherwise a span's label may not be
  present in the doc's vocab.
  • Loading branch information
adrianeboyd committed Apr 26, 2023
1 parent dbc1a2e commit 3a1cf48
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 9 deletions.
12 changes: 6 additions & 6 deletions experimental/ner_spancat_compare/configs/ner_assemble.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -55,23 +55,23 @@ component = "ner"
# Then, we instantiate a component from our `transfer-ent` factory so that
# we can move the entities in doc.ents into doc.spans.
[components.transfer-dna]
factory = "transfer-ent.v1"
factory = "transfer-ent"
span_key = ${paths.spans_key}

[components.transfer-rna]
factory = "transfer-ent.v1"
factory = "transfer-ent"
span_key = ${paths.spans_key}

[components.transfer-cell-line]
factory = "transfer-ent.v1"
factory = "transfer-ent"
span_key = ${paths.spans_key}

[components.transfer-cell-type]
factory = "transfer-ent.v1"
factory = "transfer-ent"
span_key = ${paths.spans_key}

[components.transfer-protein]
factory = "transfer-ent.v1"
factory = "transfer-ent"
span_key = ${paths.spans_key}


Expand Down Expand Up @@ -157,4 +157,4 @@ after_init = null

[initialize.components]

[initialize.tokenizer]
[initialize.tokenizer]
7 changes: 5 additions & 2 deletions experimental/ner_spancat_compare/scripts/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List

import typer
from spacy.tokens import Doc, DocBin, SpanGroup
from spacy.tokens import Doc, DocBin, Span, SpanGroup
from spacy.training.converters import conll_ner_to_docs
from wasabi import msg

Expand Down Expand Up @@ -46,8 +46,11 @@ def convert_iob_to_docs(
docs_with_spans: List[Doc] = []

for docs in zip(*docs_per_level):
spans = [ent for doc in docs for ent in doc.ents]
doc = docs[0]
# recreate every span on the first doc (docs[0]) so all spans in the group share one underlying doc
spans = []
for span in [ent for doc in docs for ent in doc.ents]:
spans.append(Span(doc, span.start, span.end, span.label_))
group = SpanGroup(doc, name=spans_key, spans=spans)
doc.spans[spans_key] = group
docs_with_spans.append(doc)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from spacy.tokens.span_group import SpanGroup


@Language.factory("transfer-ent.v1", requires=["doc._.ents"])
@Language.factory("transfer-ent", requires=["doc._.ents"])
def make_transfer_component(nlp: Language, name: str, span_key: str):
return TransferEntComponent(nlp, name, span_key)

Expand Down

0 comments on commit 3a1cf48

Please sign in to comment.