Update to v0.2.0
Rename the spaCy model to `en_core_web_sm`.
Fix special token handling for the transformers LM.
Update the TODOs in README.md.
sai-prasanna committed Nov 27, 2019
1 parent 2a7cff2 commit 98f1aea
Showing 5 changed files with 46 additions and 32 deletions.
10 changes: 4 additions & 6 deletions README.md
@@ -26,8 +26,6 @@ Unlike many approaches to GEC, this approach does NOT require annotated training

This work builds upon https://github.com/chrisjbryant/lmgec-lite/



## Components

### Language Models
@@ -42,8 +40,8 @@ Pre-trained language models for other languages, inflectors, common error patterns

## TODOs

* Research distilling GPT-2 into a smaller model (LSTM?) to reduce the horrendous latency.
* Experiment on GEC dev sets to obtain optimal thresholds.
* Find a way to handle insertions.
* Use edits in existing GEC corpus to generate candidates.
* Tests
* Publish benchmarks of the model.
* Think of simple ways to generate insertion candidates.
* Add more languages.
* Check whether LemmInflect proposals are actually better than just using [AGID](https://github.com/sai-prasanna/lmgec-lite/tree/master/resources/agid-2016.01.19).
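
The last TODO compares LemmInflect proposals against the AGID word lists. A minimal sketch of how LemmInflect produces inflection candidates (the lemmas are illustrative; assumes `pip install lemminflect`):

```python
from lemminflect import getAllInflections, getInflection

# All inflected forms of a verb, keyed by Penn Treebank tag.
print(getAllInflections("be", upos="VERB"))

# A single targeted inflection: the past tense of "give".
print(getInflection("give", tag="VBD"))  # -> ('gave',)
```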
2 changes: 1 addition & 1 deletion lmproof/candidate_generators.py
@@ -96,7 +96,7 @@ def load(cls, language: str) -> "SpellCorrectGenerator":
/ "frequency_dictionary_en_82_765.txt"
)
sym_spell.create_dictionary(str(dict_path))
spacy_model = spacy.load("en", disable=["parser", "ner"])
spacy_model = spacy.load("en_core_web_sm", disable=["parser", "ner"])
else:
raise RuntimeError(f"The language {language} is currently not supported.")
return cls(sym_spell, spacy_model)
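
The change above replaces spaCy's `"en"` shortcut with the model's package name, which is the forward-compatible way to load it. A minimal sketch of candidate generation with this setup, assuming `en_core_web_sm` has been installed via `python -m spacy download en_core_web_sm` (the corpus path and sample sentence are illustrative, not from the repository):

```python
import spacy
from symspellpy import SymSpell, Verbosity

# Parser and NER are disabled because candidate generation only needs
# tokenization and tagging.
spacy_model = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Build a SymSpell dictionary from a plain-text corpus (illustrative path;
# the repository uses symspellpy's bundled English frequency dictionary).
sym_spell = SymSpell()
sym_spell.create_dictionary("corpus.txt")

# Propose spelling candidates for each token of a sentence.
doc = spacy_model("Ths sentence has a typo.")
for token in doc:
    suggestions = sym_spell.lookup(token.text, Verbosity.CLOSEST, max_edit_distance=2)
    print(token.text, [s.term for s in suggestions])
```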
8 changes: 6 additions & 2 deletions lmproof/scorer.py
@@ -1,7 +1,7 @@
from typing import List, Optional
import logging

import torch
import logging
from torch.nn import CrossEntropyLoss
from transformers import (
AutoTokenizer,
@@ -25,6 +25,7 @@ def __init__(
model: PreTrainedModel,
device: str = "cpu",
batch_size: int = 1,
add_special_tokens: bool = False,
normalize: bool = False,
):
# Load pre-trained model tokenizer (vocabulary)
@@ -33,6 +34,7 @@ def __init__(
self.model = model.to(self.device).eval()
self.batch_size = batch_size
self.normalize = normalize
self._add_special_tokens = add_special_tokens
self._loss_fn = CrossEntropyLoss(ignore_index=-1)

@classmethod
@@ -58,7 +60,9 @@ def score(self, sentences: List[str]) -> List[Optional[float]]:

tokenized_batch = []
for i, sentence in enumerate(batched_sentences):
tokens = self.tokenizer.encode(sentence)
tokens = self.tokenizer.encode(
sentence, add_special_tokens=self._add_special_tokens
)
if len(tokens) <= self.tokenizer.max_len:
tokenized_batch.append(torch.LongTensor(tokens)) # type: ignore
batch_scored_idx.append(i)
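
The new `add_special_tokens` argument is passed straight through to `tokenizer.encode`, so the scorer controls whether markers such as `[CLS]`/`[SEP]` are added before the language model scores a sentence. A rough sketch of that scoring idea, assuming a GPT-2 model (the `score` helper is illustrative, not the library's actual method):

```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def score(sentence: str, add_special_tokens: bool = False) -> float:
    # GPT-2 defines no [CLS]/[SEP], so the flag is a no-op here, but passing
    # it through keeps the behaviour explicit for tokenizers that do add them.
    ids = tokenizer.encode(sentence, add_special_tokens=add_special_tokens)
    input_ids = torch.tensor([ids])
    with torch.no_grad():
        # With labels supplied, the model's first output is the mean
        # token-level cross-entropy loss over the sequence.
        loss = model(input_ids, labels=input_ids)[0]
    return -loss.item()  # closer to zero ~ judged more fluent

print(score("She went to the store."))   # expected to outscore the next line
print(score("She goed to the store."))
```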