Skip to content

Commit

Permalink
TextCat updates and fixes (#6263)
Browse files Browse the repository at this point in the history
* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
  • Loading branch information
svlandeg authored Oct 18, 2020
1 parent e2f3c4e commit 75a202c
Show file tree
Hide file tree
Showing 20 changed files with 231 additions and 123 deletions.
2 changes: 1 addition & 1 deletion spacy/about.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0rc1"
__version__ = "3.0.0rc2"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
Expand Down
2 changes: 1 addition & 1 deletion spacy/cli/init_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def init_labels_cli(
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not output_path.exists():
output_path.mkdir()
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
Expand Down
32 changes: 20 additions & 12 deletions spacy/cli/templates/quickstart_training.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,18 @@ factory = "textcat"

{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
nO = null
no_output_layer = false

{% else -%}
[components.textcat.model]
Expand Down Expand Up @@ -271,14 +275,18 @@ factory = "textcat"

{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
nO = null
no_output_layer = false

{% else -%}
[components.textcat.model]
Expand Down
2 changes: 1 addition & 1 deletion spacy/cli/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def train_cli(
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if output_path is not None and not output_path.exists():
output_path.mkdir()
output_path.mkdir(parents=True)
msg.good(f"Created output directory: {output_path}")
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
Expand Down
6 changes: 4 additions & 2 deletions spacy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,8 @@ class Errors:
E163 = ("cumsum was found to be unstable: its last element does not "
"correspond to sum")
E164 = ("x is neither increasing nor decreasing: {x}.")
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
"that case.")
E165 = ("Only one class present in the gold labels: {label}. "
"ROC AUC score is not defined in that case.")
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
"Current DocBin: {current}\nOther DocBin: {other}")
E169 = ("Can't find module: {module}")
Expand Down Expand Up @@ -456,6 +456,8 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")

# TODO: fix numbering after merging develop into master
E897 = ("Field '{field}' should be a dot-notation string referring to the "
"relevant section in the config, but found type {type} instead.")
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
"is not set or None. If you've implemented a custom component, make "
"sure to store the component model as `self.model` in your "
Expand Down
57 changes: 44 additions & 13 deletions spacy/ml/models/textcat.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import Optional
from typing import Optional, List

from thinc.types import Floats2d
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
Expand All @@ -10,28 +12,28 @@
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...tokens import Doc


@registry.architectures.register("spacy.TextCatCNN.v1")
def build_simple_cnn_text_classifier(
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
) -> Model:
) -> Model[List[Doc], Floats2d]:
"""
Build a simple CNN text classifier, given a token-to-vector model as inputs.
If exclusive_classes=True, a softmax non-linearity is applied, so that the
outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
is applied instead, so that outputs are in the range [0, 1].
"""
with Model.define_operators({">>": chain}):
cnn = tok2vec >> list2ragged() >> reduce_mean()
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
model = cnn >> output_layer
model.set_ref("output_layer", output_layer)
else:
linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
model = (
tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
)
model = cnn >> linear_layer >> Logistic()
model.set_ref("output_layer", linear_layer)
model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nO)
Expand All @@ -45,8 +47,7 @@ def build_bow_text_classifier(
ngram_size: int,
no_output_layer: bool,
nO: Optional[int] = None,
) -> Model:
# Don't document this yet, I'm not sure it's right.
) -> Model[List[Doc], Floats2d]:
with Model.define_operators({">>": chain}):
sparse_linear = SparseLinear(nO)
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
Expand All @@ -59,6 +60,39 @@ def build_bow_text_classifier(
return model


@registry.architectures.register("spacy.TextCatEnsemble.v2")
def build_text_classifier(
tok2vec: Model[List[Doc], List[Floats2d]],
linear_model: Model[List[Doc], Floats2d],
nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
exclusive_classes = not linear_model.attrs["multi_label"]
with Model.define_operators({">>": chain, "|": concatenate}):
width = tok2vec.get_dim("nO")
cnn_model = (
tok2vec
>> list2ragged()
>> ParametricAttention(width) # TODO: benchmark performance difference of this layer
>> reduce_sum()
>> residual(Maxout(nO=width, nI=width))
>> Linear(nO=nO, nI=width)
>> Dropout(0.0)
)

nO_double = nO * 2 if nO else None
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double)
else:
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False:
model.set_dim("nO", nO)
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
model.attrs["multi_label"] = not exclusive_classes
return model

# TODO: move to legacy
@registry.architectures.register("spacy.TextCatEnsemble.v1")
def build_text_classifier(
width: int,
Expand Down Expand Up @@ -158,11 +192,8 @@ def build_text_classifier(

@registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(
width: int,
pretrained_vectors: Optional[bool],
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
width: int, dropout: Optional[float], nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
# Don't document this yet, I'm not sure it's right.
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
with Model.define_operators({">>": chain, "**": clone}):
Expand Down
2 changes: 1 addition & 1 deletion spacy/ml/models/tok2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def MultiHashEmbed(
) -> Model[List[Doc], List[Floats2d]]:
"""Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representations.
through a feed-forward subnetwork to build a mixed representation.
The features used can be configured with the 'attrs' argument. The suggested
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
Expand Down
35 changes: 26 additions & 9 deletions spacy/pipeline/textcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,30 @@

default_model_config = """
[model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
pretrained_vectors = null
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
conv_depth = 2
embed_size = 2000
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2
[model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
dropout = null
no_output_layer = false
"""
DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]

Expand Down Expand Up @@ -60,9 +75,11 @@
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_p": None,
"cats_r": None,
"cats_f": None,
"cats_micro_p": None,
"cats_micro_r": None,
"cats_micro_f": None,
"cats_macro_p": None,
"cats_macro_r": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
Expand Down
30 changes: 19 additions & 11 deletions spacy/scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ def to_dict(self) -> Dict[str, float]:


class ROCAUCScore:
"""An AUC ROC score."""
"""An AUC ROC score. This is only defined for binary classification.
Use the method is_binary before calculating the score, otherwise it
may throw an error."""

def __init__(self) -> None:
self.golds = []
Expand All @@ -71,16 +73,16 @@ def score_set(self, cand, gold) -> None:
self.cands.append(cand)
self.golds.append(gold)

def is_binary(self):
return len(np.unique(self.golds)) == 2

@property
def score(self):
if not self.is_binary():
raise ValueError(Errors.E165.format(label=set(self.golds)))
if len(self.golds) == self.saved_score_at_len:
return self.saved_score
try:
self.saved_score = _roc_auc_score(self.golds, self.cands)
# catch ValueError: Only one class present in y_true.
# ROC AUC score is not defined in that case.
except ValueError:
self.saved_score = -float("inf")
self.saved_score = _roc_auc_score(self.golds, self.cands)
self.saved_score_at_len = len(self.golds)
return self.saved_score

Expand Down Expand Up @@ -362,9 +364,13 @@ def score_cats(
for all:
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
attr_score_desc (text description of the overall score),
attr_micro_p,
attr_micro_r,
attr_micro_f,
attr_macro_p,
attr_macro_r,
attr_macro_f,
attr_auc,
attr_macro_auc,
attr_f_per_type,
attr_auc_per_type
Expand Down Expand Up @@ -431,7 +437,9 @@ def score_cats(
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
# Limit macro_auc to those labels with gold annotations,
# but still divide by all cats to avoid artificial boosting of datasets with missing labels
macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
results = {
f"{attr}_score": None,
f"{attr}_score_desc": None,
Expand All @@ -443,7 +451,7 @@ def score_cats(
f"{attr}_macro_f": macro_f,
f"{attr}_macro_auc": macro_auc,
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
}
if len(labels) == 2 and not multi_label and positive_label:
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
Expand Down Expand Up @@ -726,7 +734,7 @@ def _roc_auc_score(y_true, y_score):
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
"""
if len(np.unique(y_true)) != 2:
raise ValueError(Errors.E165)
raise ValueError(Errors.E165.format(label=np.unique(y_true)))
fpr, tpr, _ = _roc_curve(y_true, y_score)
return _auc(fpr, tpr)

Expand Down
14 changes: 5 additions & 9 deletions spacy/tests/pipeline/test_pipe_factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from spacy.language import Language
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.tokens import Doc
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
from thinc.api import Model, Linear, ConfigValidationError
Expand Down Expand Up @@ -156,15 +157,10 @@ def test_pipe_class_component_model():
name = "test_class_component_model"
default_config = {
"model": {
"@architectures": "spacy.TextCatEnsemble.v1",
"exclusive_classes": False,
"pretrained_vectors": None,
"width": 64,
"embed_size": 2000,
"window_size": 1,
"conv_depth": 2,
"ngram_size": 1,
"dropout": None,
"@architectures": "spacy.TextCatEnsemble.v2",
"tok2vec": DEFAULT_TOK2VEC_MODEL,
"linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
"no_output_layer": False},
},
"value1": 10,
}
Expand Down
7 changes: 3 additions & 4 deletions spacy/tests/pipeline/test_textcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def test_overfitting_IO():
nlp = English()
nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
# Set exclusive labels
config = {"model": {"exclusive_classes": True}}
config = {"model": {"linear_model": {"exclusive_classes": True}}}
textcat = nlp.add_pipe("textcat", config=config)
train_examples = []
for text, annotations in TRAIN_DATA:
Expand Down Expand Up @@ -192,9 +192,8 @@ def test_overfitting_IO():
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
{"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
{"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
],
Expand Down
Loading

0 comments on commit 75a202c

Please sign in to comment.