TextCat updates and fixes (#6263)

* small fix in example imports
* throw error when train_corpus or dev_corpus is not a string
* small fix in custom logger example
* limit macro_auc to labels with 2 annotations
* fix typo
* also create parents of output_dir if need be
* update documentation of textcat scores
* refactor TextCatEnsemble
* fix tests for new AUC definition
* bump to 3.0.0a42
* update docs
* rename to spacy.TextCatEnsemble.v2
* spacy.TextCatEnsemble.v1 in legacy
* cleanup
* small fix
* update to 3.0.0rc2
* fix import that got lost in merge
* cursed IDE
* fix two typos
explosion · Oct 18, 2020 · 75a202c · 75a202c
1 parent e2f3c4e
commit 75a202c
Show file tree

Hide file tree

Showing 20 changed files with 231 additions and 123 deletions.
diff --git a/spacy/about.py b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0rc1"
+__version__ = "3.0.0rc2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
@@ -100,7 +100,7 @@ def init_labels_cli(
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     if not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
@@ -136,14 +136,18 @@ factory = "textcat"
 
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
+@architectures = "spacy.TextCatEnsemble.v2"
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
 ngram_size = 1
-nO = null
+no_output_layer = false
 
 {% else -%}
 [components.textcat.model]
@@ -271,14 +275,18 @@ factory = "textcat"
 
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
+@architectures = "spacy.TextCatEnsemble.v2"
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
 ngram_size = 1
-nO = null
+no_output_layer = false
 
 {% else -%}
 [components.textcat.model]

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
@@ -44,7 +44,7 @@ def train_cli(
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
     if output_path is not None and not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
         msg.good(f"Created output directory: {output_path}")
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)

diff --git a/spacy/errors.py b/spacy/errors.py
@@ -398,8 +398,8 @@ class Errors:
     E163 = ("cumsum was found to be unstable: its last element does not "
             "correspond to sum")
     E164 = ("x is neither increasing nor decreasing: {x}.")
-    E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
-            "that case.")
+    E165 = ("Only one class present in the gold labels: {label}. "
+            "ROC AUC score is not defined in that case.")
     E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E897 = ("Field '{field}' should be a dot-notation string referring to the "
+            "relevant section in the config, but found type {type} instead.")
     E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
             "is not set or None. If you've implemented a custom component, make "
             "sure to store the component model as `self.model` in your "

diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
@@ -1,4 +1,6 @@
-from typing import Optional
+from typing import Optional, List
+
+from thinc.types import Floats2d
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
@@ -10,28 +12,28 @@
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
+from ...tokens import Doc
 
 
 @registry.architectures.register("spacy.TextCatCNN.v1")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
-) -> Model:
+) -> Model[List[Doc], Floats2d]:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the
     outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
     is applied instead, so that outputs are in the range [0, 1].
     """
     with Model.define_operators({">>": chain}):
+        cnn = tok2vec >> list2ragged() >> reduce_mean()
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
+            model = cnn >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = (
-                tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
-            )
+            model = cnn >> linear_layer >> Logistic()
             model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)
@@ -45,8 +47,7 @@ def build_bow_text_classifier(
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
-) -> Model:
-    # Don't document this yet, I'm not sure it's right.
+) -> Model[List[Doc], Floats2d]:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@@ -59,6 +60,39 @@ def build_bow_text_classifier(
     return model
 
 
+@registry.architectures.register("spacy.TextCatEnsemble.v2")
+def build_text_classifier(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    linear_model: Model[List[Doc], Floats2d],
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    exclusive_classes = not linear_model.attrs["multi_label"]
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.get_dim("nO")
+        cnn_model = (
+                tok2vec
+                >> list2ragged()
+                >> ParametricAttention(width)   # TODO: benchmark performance difference of this layer
+                >> reduce_sum()
+                >> residual(Maxout(nO=width, nI=width))
+                >> Linear(nO=nO, nI=width)
+                >> Dropout(0.0)
+        )
+
+        nO_double = nO * 2 if nO else None
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO, nI=nO_double)
+        else:
+            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
+        model = (linear_model | cnn_model) >> output_layer
+        model.set_ref("tok2vec", tok2vec)
+    if model.has_dim("nO") is not False:
+        model.set_dim("nO", nO)
+    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
+    model.attrs["multi_label"] = not exclusive_classes
+    return model
+
+# TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
     width: int,
@@ -158,11 +192,8 @@ def build_text_classifier(
 
 @registry.architectures.register("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
-    width: int,
-    pretrained_vectors: Optional[bool],
-    dropout: Optional[float],
-    nO: Optional[int] = None,
-) -> Model:
+    width: int, dropout: Optional[float], nO: Optional[int] = None
+) -> Model[List[Doc], Floats2d]:
     # Don't document this yet, I'm not sure it's right.
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
@@ -106,7 +106,7 @@ def MultiHashEmbed(
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
-    through a feed-forward subnetwork to build a mixed representations.
+    through a feed-forward subnetwork to build a mixed representation.
 
     The features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into

diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
@@ -16,15 +16,30 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
+@architectures = "spacy.TextCatEnsemble.v2"
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
 width = 64
-conv_depth = 2
-embed_size = 2000
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${model.tok2vec.embed.width}
 window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
 ngram_size = 1
-dropout = null
+no_output_layer = false
 """
 DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
 
@@ -60,9 +75,11 @@
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
-        "cats_p": None,
-        "cats_r": None,
-        "cats_f": None,
+        "cats_micro_p": None,
+        "cats_micro_r": None,
+        "cats_micro_f": None,
+        "cats_macro_p": None,
+        "cats_macro_r": None,
         "cats_macro_f": None,
         "cats_macro_auc": None,
         "cats_f_per_type": None,

diff --git a/spacy/scorer.py b/spacy/scorer.py
@@ -59,7 +59,9 @@ def to_dict(self) -> Dict[str, float]:
 
 
 class ROCAUCScore:
-    """An AUC ROC score."""
+    """An AUC ROC score. This is only defined for binary classification.
+    Use the method is_binary before calculating the score, otherwise it
+    may throw an error."""
 
     def __init__(self) -> None:
         self.golds = []
@@ -71,16 +73,16 @@ def score_set(self, cand, gold) -> None:
         self.cands.append(cand)
         self.golds.append(gold)
 
+    def is_binary(self):
+        return len(np.unique(self.golds)) == 2
+
     @property
     def score(self):
+        if not self.is_binary():
+            raise ValueError(Errors.E165.format(label=set(self.golds)))
         if len(self.golds) == self.saved_score_at_len:
             return self.saved_score
-        try:
-            self.saved_score = _roc_auc_score(self.golds, self.cands)
-        # catch ValueError: Only one class present in y_true.
-        # ROC AUC score is not defined in that case.
-        except ValueError:
-            self.saved_score = -float("inf")
+        self.saved_score = _roc_auc_score(self.golds, self.cands)
         self.saved_score_at_len = len(self.golds)
         return self.saved_score
 
@@ -362,9 +364,13 @@ def score_cats(
             for all:
                 attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
                 attr_score_desc (text description of the overall score),
+                attr_micro_p,
+                attr_micro_r,
                 attr_micro_f,
+                attr_macro_p,
+                attr_macro_r,
                 attr_macro_f,
-                attr_auc,
+                attr_macro_auc,
                 attr_f_per_type,
                 attr_auc_per_type
 
@@ -431,7 +437,9 @@ def score_cats(
         macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
         macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
         macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
-        macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
+        # Limit macro_auc to those labels with gold annotations,
+        # but still divide by all cats to avoid artificial boosting of datasets with missing labels
+        macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
         results = {
             f"{attr}_score": None,
             f"{attr}_score_desc": None,
@@ -443,7 +451,7 @@ def score_cats(
             f"{attr}_macro_f": macro_f,
             f"{attr}_macro_auc": macro_auc,
             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
@@ -726,7 +734,7 @@ def _roc_auc_score(y_true, y_score):
             <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
     """
     if len(np.unique(y_true)) != 2:
-        raise ValueError(Errors.E165)
+        raise ValueError(Errors.E165.format(label=np.unique(y_true)))
     fpr, tpr, _ = _roc_curve(y_true, y_score)
     return _auc(fpr, tpr)
 

diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
@@ -2,6 +2,7 @@
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
 from spacy.util import registry, SimpleFrozenDict, combine_score_weights
 from thinc.api import Model, Linear, ConfigValidationError
@@ -156,15 +157,10 @@ def test_pipe_class_component_model():
     name = "test_class_component_model"
     default_config = {
         "model": {
-            "@architectures": "spacy.TextCatEnsemble.v1",
-            "exclusive_classes": False,
-            "pretrained_vectors": None,
-            "width": 64,
-            "embed_size": 2000,
-            "window_size": 1,
-            "conv_depth": 2,
-            "ngram_size": 1,
-            "dropout": None,
+            "@architectures": "spacy.TextCatEnsemble.v2",
+            "tok2vec": DEFAULT_TOK2VEC_MODEL,
+            "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
+                      "no_output_layer": False},
         },
         "value1": 10,
     }

diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
@@ -140,7 +140,7 @@ def test_overfitting_IO():
     nlp = English()
     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
     # Set exclusive labels
-    config = {"model": {"exclusive_classes": True}}
+    config = {"model": {"linear_model": {"exclusive_classes": True}}}
     textcat = nlp.add_pipe("textcat", config=config)
     train_examples = []
     for text, annotations in TRAIN_DATA:
@@ -192,9 +192,8 @@ def test_overfitting_IO():
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
     ],