
Commit e340632

Manish Mohan authored and committed
Switch FastWordpieceTokenizer to default to running pre-tokenization, and rename the end_to_end parameter to no_pretokenization. This should be a no-op. The flatbuffer is not changed, so models already using FWP are unaffected. Only the Python API is updated.
PiperOrigin-RevId: 409197312
1 parent: 388db85 · commit: e340632

7 files changed: +75 −64 lines
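
In the Python API, the rename and the new default look roughly like the sketch below. This is a minimal illustration adapted from the doctest examples changed in fast_wordpiece_tokenizer.py further down; the vocab is the illustrative one used there, and it assumes the tokenizer is imported from tensorflow_text as in the library's public API.

    import tensorflow as tf
    import tensorflow_text as tf_text

    vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]", "'", "re"]

    # New default: pre-tokenization (splitting on whitespace/punctuation) now
    # runs before WordPiece, which previously required end_to_end=True.
    tokenizer = tf_text.FastWordpieceTokenizer(vocab, token_out_type=tf.string)
    print(tokenizer.tokenize(["they're the greatest"]))

    # Inputs that are already split into words use the renamed flag, replacing
    # the old end_to_end=False behavior.
    word_tokenizer = tf_text.FastWordpieceTokenizer(
        vocab, token_out_type=tf.string, no_pretokenization=True)
    print(word_tokenizer.tokenize([["they're", "the", "greatest"]]))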

tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc

+20 −17
@@ -169,12 +169,13 @@ class StringVocab : public WordpieceVocab {
 // Builds the FastWordpieceTokenizer model.
 class FastWordpieceBuilder {
  public:
-  // When end_to_end is true, we split the input string by punctuation chars
-  // (in addition to whitespaces) and then tokenize it to wordpieces.
+  // When no_pretokenization is false, we split the input string by punctuation
+  // chars (in addition to whitespaces) and then tokenize it to wordpieces.
   absl::Status BuildModel(const std::vector<std::string>& vocab,
                           int max_bytes_per_token,
                           absl::string_view suffix_indicator,
-                          absl::string_view unk_token, bool end_to_end,
+                          absl::string_view unk_token,
+                          bool no_pretokenization,
                           bool support_detokenization);

   absl::StatusOr<std::string> ExportToFlatBuffer() const;
@@ -267,10 +268,10 @@ class FastWordpieceBuilder {
   uint32_t trie_punct_failure_link_node_ =
       fast_wordpiece_tokenizer_utils::kNullNode;

-  // Whether to build the end-to-end tokenizer that tokenize general texts. It
-  // splits the input on punctuation/whitespace and treat each punctuation as an
-  // independent word.
-  bool end_to_end_;
+  // Whether to build the end-to-end tokenizer that tokenizes general texts.
+  // When set to false, it splits the input on punctuation/whitespace and treat
+  // each punctuation as an independent word.
+  bool no_pretokenization_;

   // Whether the tokenizer supports the detokenization function.
   bool support_detokenization_;
@@ -294,11 +295,11 @@ class FastWordpieceBuilder {
 absl::Status FastWordpieceBuilder::BuildModel(
     const std::vector<std::string>& vocab, int max_bytes_per_token,
     absl::string_view suffix_indicator, absl::string_view unk_token,
-    bool end_to_end, bool support_detokenization) {
+    bool no_pretokenization, bool support_detokenization) {
   unk_token_ = std::string(unk_token);
   suffix_indicator_ = std::string(suffix_indicator);
   max_bytes_per_token_ = max_bytes_per_token;
-  end_to_end_ = end_to_end;
+  no_pretokenization_ = no_pretokenization;
   support_detokenization_ = support_detokenization;

   vocab_.emplace(vocab);
@@ -397,7 +398,7 @@ FastWordpieceBuilder::PrepareVocabTokensToBuildTrie() {
     }
     // Skip word that contains punctuation but is not a punctuation itself.
     // <unk>, <pad>, ##. are skipped in this step.
-    if (end_to_end_ && vocab_token.ContainsPunctuation() &&
+    if (!no_pretokenization_ && vocab_token.ContainsPunctuation() &&
        (vocab_token.TokenUnicodeLengthWithoutSuffixIndicator() > 1 ||
         vocab_token.IsSuffixToken())) {
      continue;
@@ -425,7 +426,7 @@ FastWordpieceBuilder::PrepareVocabTokensToBuildTrie() {
     }
   }

-  if (end_to_end_) {
+  if (!no_pretokenization_) {
     // Special treatment for all Unicode punctuation chars that are not already
     // in the trie.
     // The maximum codepoint in Unicode is 0x0010FFFF.
@@ -490,7 +491,7 @@ absl::Status FastWordpieceBuilder::ConstructTrie(
   }
   trie_suffix_root_ = node.node_id;

-  if (end_to_end_) {
+  if (!no_pretokenization_) {
     // Locate the dummy node for the failure link for punctuation nodes.
     node = trie_->CreateTraversalCursorPointToRoot();
     if (!trie_->TryTraverseSeveralSteps(node,
@@ -616,7 +617,8 @@ absl::Status FastWordpieceBuilder::BuildFailureStructure(
           "Failed to find if an end node in the trie is a punctuation char "
           "in node_id_is_punc_map_. It should never happen.");
     }
-    if (end_to_end_ && node_id_is_punc_map_.at(child_node.node_id)) {
+    if (!no_pretokenization_ &&
+        node_id_is_punc_map_.at(child_node.node_id)) {
       // For end-to-end tokenizer, we set the failure link node of every
       // punctuation char as a special node trie_punct_failure_link_node_
       // which is a dummy node (no parent, no descendants, failure link is
@@ -718,7 +720,7 @@ absl::Status FastWordpieceBuilder::BuildFailureStructure(
     }
   }

-  if (end_to_end_ && !suffix_indicator_.empty()) {
+  if (!no_pretokenization_ && !suffix_indicator_.empty()) {
     // Rewire trie links along suffix_indicator_.
     // If the suffix indicator contains a punctuation char, let `u`--(`c`)-->`v`
     // be the first trie edge along the suffix indicator such that the edge
@@ -937,7 +939,7 @@ absl::StatusOr<std::string> FastWordpieceBuilder::ExportToFlatBuffer() const {
   wtcb.add_unk_token_id(unk_token_id_);
   wtcb.add_precomputed_result_for_suffix_indicator(
       precomputed_result_for_suffix_indicator);
-  wtcb.add_end_to_end(end_to_end_);
+  wtcb.add_end_to_end(!no_pretokenization_);
   wtcb.add_support_detokenization(support_detokenization_);
   wtcb.add_vocab_array(vocab_array);
   wtcb.add_vocab_is_suffix_array(vocab_is_suffix_array);
@@ -950,10 +952,11 @@ absl::StatusOr<std::string> FastWordpieceBuilder::ExportToFlatBuffer() const {
 absl::StatusOr<std::string> BuildModelAndExportToFlatBuffer(
     const std::vector<std::string>& vocab, int max_bytes_per_token,
     absl::string_view suffix_indicator, absl::string_view unk_token,
-    bool end_to_end, bool support_detokenization) {
+    bool no_pretokenization, bool support_detokenization) {
   FastWordpieceBuilder builder;
   SH_RETURN_IF_ERROR(builder.BuildModel(vocab, max_bytes_per_token,
-                                        suffix_indicator, unk_token, end_to_end,
+                                        suffix_indicator, unk_token,
+                                        no_pretokenization,
                                         support_detokenization));
   SH_ASSIGN_OR_RETURN(std::string flatbuffer, builder.ExportToFlatBuffer());
   return flatbuffer;

tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h

+3 −3
@@ -32,8 +32,8 @@ namespace text {
 // * suffix_indicator: Characters prepended to a wordpiece to indicate that
 //   it is a suffix to another subword, such as "##".
 // * unk_token: The unknown token string.
-// * end_to_end: Whether to build end-to-end tokneizer.
-//   Set to `true` when the model is used for general text end-to-end
+// * no_pretokenization: Whether to pretokenize on punctuation & whitespace.
+//   Set to `false` when the model is used for general text end-to-end
 //   tokenization, which combines pre-tokenization (splitting text into words
 //   on punctuation/whitespaces) and WordPiece (breaking words into subwords)
 //   into one pass.
@@ -46,7 +46,7 @@ namespace text {
 absl::StatusOr<std::string> BuildModelAndExportToFlatBuffer(
     const std::vector<std::string>& vocab, int max_bytes_per_token,
     absl::string_view suffix_indicator, absl::string_view unk_token,
-    bool end_to_end = false, bool support_detokenization = false);
+    bool no_pretokenization = false, bool support_detokenization = false);
 }  // namespace text
 }  // namespace tensorflow

tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc

+12 −11
@@ -1143,7 +1143,8 @@ TEST_P(TestTokenizeSingleWord, Test) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token));
+                                      spec.suffix_indicator, spec.unk_token,
+                                      /*no_pretokenization=*/true));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -1164,7 +1165,8 @@ TEST_P(TestTokenizeSingleWord, TestNoOutputPieces) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token));
+                                      spec.suffix_indicator, spec.unk_token,
+                                      true /* no_pretokenization */));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -1183,7 +1185,8 @@ TEST_P(TestTokenizeSingleWord, TestNoOutputPiecesOnlyOutputIds) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token));
+                                      spec.suffix_indicator, spec.unk_token,
+                                      true /* no_pretokenization */));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -1198,7 +1201,8 @@ TEST_P(TestTokenizeSingleWord, TestNoOutputPiecesWithPositiveSentenceOffsets) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token));
+                                      spec.suffix_indicator, spec.unk_token,
+                                      true /* no_pretokenization */));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -2365,8 +2369,7 @@ TEST_P(TestTokenizeText, Test) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token,
-                                      /*end_to_end=*/true));
+                                      spec.suffix_indicator, spec.unk_token));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -2387,8 +2390,7 @@ TEST_P(TestTokenizeText, TestNoOutputPieces) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token,
-                                      /*end_to_end=*/true));
+                                      spec.suffix_indicator, spec.unk_token));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -2407,8 +2409,7 @@ TEST_P(TestTokenizeText, TestNoOutputPiecesOnlyOutputIds) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token,
-                                      /*end_to_end=*/true));
+                                      spec.suffix_indicator, spec.unk_token));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -2466,7 +2467,7 @@ TEST_P(TestTokenizeDetokenize, Test) {
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
                                       spec.suffix_indicator, spec.unk_token,
-                                      /*end_to_end=*/false,
+                                      /*no_pretokenization=*/true,
                                       /*support_detokenization=*/true));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

tensorflow_text/core/pybinds/pywrap_fast_wordpiece_tokenizer_model_builder.cc

+2 −2
@@ -27,10 +27,10 @@ PYBIND11_MODULE(pywrap_fast_wordpiece_tokenizer_model_builder, m) {
   m.def("build_fast_wordpiece_model",
         [](const std::vector<std::string>& vocab, int max_bytes_per_token,
            const std::string& suffix_indicator, const std::string& unk_token,
-           bool end_to_end, bool support_detokenization) {
+           bool no_pretokenization, bool support_detokenization) {
          const auto result = BuildModelAndExportToFlatBuffer(
              vocab, max_bytes_per_token, suffix_indicator, unk_token,
-             end_to_end, support_detokenization);
+             no_pretokenization, support_detokenization);
          if (!result.status().ok()) {
            // Propagate the error to the Python code.
            throw std::runtime_error(std::string(result.status().message()));

tensorflow_text/core/pybinds/pywrap_fast_wordpiece_tokenizer_model_builder_test.py

+2 −2
@@ -41,7 +41,7 @@ def test_build(self):
     self.assertEqual(
         pywrap_fast_wordpiece_tokenizer_model_builder
         .build_fast_wordpiece_model(
-            vocab, max_bytes_per_token, suffix_indicator, unk_token, False,
+            vocab, max_bytes_per_token, suffix_indicator, unk_token, True,
             False),
         expected_model_buffer)

@@ -56,7 +56,7 @@ def test_build_throw_exception_unk_token_not_in_vocab(self):
         "Cannot find unk_token in the vocab!"):
       (pywrap_fast_wordpiece_tokenizer_model_builder
        .build_fast_wordpiece_model(
-           vocab, max_bytes_per_token, suffix_indicator, unk_token, False,
+           vocab, max_bytes_per_token, suffix_indicator, unk_token, True,
            False))
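
The flipped boolean in these tests is the fifth positional argument of build_fast_wordpiece_model, which now carries no_pretokenization instead of end_to_end. Since the builder writes the flatbuffer's end_to_end field as the negation of no_pretokenization, passing True here produces the same flatbuffer that end_to_end=False did, so expected_model_buffer is unchanged. A rough annotation of the call exercised above (the pybind wrapper in this commit takes the arguments positionally):

    model_buffer = (pywrap_fast_wordpiece_tokenizer_model_builder
                    .build_fast_wordpiece_model(
                        vocab, max_bytes_per_token, suffix_indicator, unk_token,
                        True,    # no_pretokenization (was end_to_end=False)
                        False))  # support_detokenization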

tensorflow_text/python/ops/fast_wordpiece_tokenizer.py

+15 −14
@@ -67,15 +67,15 @@ def __init__(self,
                max_bytes_per_word=100,
                token_out_type=dtypes.int64,
                unknown_token='[UNK]',
-               end_to_end=False,
+               no_pretokenization=False,
                support_detokenization=False,
                model_buffer=None):
     """Initializes the FastWordpieceTokenizer.

     Two ways to initialize:
       * (preferred) use a precompiled `model_buffer`.
       * use `vocab`, `suffix_indicator`, `max_bytes_per_word`, `unknown_token`,
-        and `end_to_end`.
+        and `no_pretokenization`.

     Args:
       vocab: (optional) The list of tokens in the vocabulary.
@@ -86,9 +86,9 @@ def __init__(self,
         `tf.int64` or `tf.int32` IDs, or `tf.string` subwords.
       unknown_token: (optional) The string value to substitute for an unknown
         token. It must be included in `vocab`.
-      end_to_end: (optional) Whether to use end-to-end Fast WordPiece tokenizer.
-        When true, the input must be a sentence and we split the input on
-        whitespaces and punctuations.
+      no_pretokenization: (optional) By default, the input is split on
+        whitespaces and punctuations before applying the Wordpiece tokenization.
+        When true, the input is assumed to be pretokenized already.
       support_detokenization: (optional) Whether to make the tokenizer support
         doing detokenization. Setting it to true expands the size of the model
         flatbuffer. As a reference, when using 120k multilingual BERT WordPiece
@@ -105,7 +105,8 @@ def __init__(self,
       model_buffer = (pywrap_fast_wordpiece_tokenizer_model_builder
                       .build_fast_wordpiece_model(
                           vocab, max_bytes_per_word, suffix_indicator,
-                          unknown_token, end_to_end, support_detokenization))
+                          unknown_token, no_pretokenization,
+                          support_detokenization))
     # Use uint8 tensor as a buffer for the model to avoid any possible changes,
     # for example truncation by '\0'.
     self._model = constant_op.constant(list(model_buffer), dtype=dtypes.uint8)
@@ -117,18 +118,18 @@ def tokenize(self, input): # pylint: disable=redefined-builtin

     ### Example 1, single word tokenization:
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
-    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string,
+    ...                                    no_pretokenization=True)
     >>> tokens = [["they're", "the", "greatest"]]
     >>> tokenizer.tokenize(tokens)
     <tf.RaggedTensor [[[b'they', b"##'", b'##re'], [b'the'],
                        [b'great', b'##est']]]>

-    ### Example 2, general text end-to-end tokenization (pre-tokenization on
+    ### Example 2, general text tokenization (pre-tokenization on
     ### punctuation and whitespace followed by WordPiece tokenization):
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]",
     ...          "'", "re"]
-    >>> tokenizer = FastWordpieceTokenizer(
-    ...     vocab, token_out_type=tf.string, end_to_end=True)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
     >>> tokens = [["they're the greatest", "the greatest"]]
     >>> tokenizer.tokenize(tokens)
     <tf.RaggedTensor [[[b'they', b"'", b're', b'the', b'great', b'##est'],
@@ -154,7 +155,8 @@ def tokenize_with_offsets(self, input): # pylint: disable=redefined-builtin

     ### Example 1, single word tokenization:
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
-    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string,
+    ...                                    no_pretokenization=True)
     >>> tokens = [["they're", "the", "greatest"]]
     >>> subtokens, starts, ends = tokenizer.tokenize_with_offsets(tokens)
     >>> subtokens
@@ -165,12 +167,11 @@ def tokenize_with_offsets(self, input): # pylint: disable=redefined-builtin
     >>> ends
     <tf.RaggedTensor [[[4, 5, 7], [3], [5, 8]]]>

-    ### Example 2, general text end-to-end tokenization (pre-tokenization on
+    ### Example 2, general text tokenization (pre-tokenization on
     ### punctuation and whitespace followed by WordPiece tokenization):
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]",
     ...          "'", "re"]
-    >>> tokenizer = FastWordpieceTokenizer(
-    ...     vocab, token_out_type=tf.string, end_to_end=True)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
     >>> tokens = [["they're the greatest", "the greatest"]]
     >>> subtokens, starts, ends = tokenizer.tokenize_with_offsets(tokens)
     >>> subtokens
