
Commit e340632

Manish Mohan authored and committed
Switch FastWordpieceTokenizer to default to running pre-tokenization, and rename the end_to_end parameter to no_pretokenization. This should be a no-op. The flatbuffer is not changed, so models already using FWP are unaffected. Only the Python API is updated.
PiperOrigin-RevId: 409197312
1 parent: 388db85 · commit: e340632

7 files changed: +75 −64 lines
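
In the Python API, the rename and the new default look roughly like the sketch below. This is a minimal illustration adapted from the doctest examples changed in fast_wordpiece_tokenizer.py further down; the vocab is the illustrative one used there, and it assumes the tokenizer is imported from tensorflow_text as in the library's public API.

    import tensorflow as tf
    import tensorflow_text as tf_text

    vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]", "'", "re"]

    # New default: pre-tokenization (splitting on whitespace/punctuation) now
    # runs before WordPiece, which previously required end_to_end=True.
    tokenizer = tf_text.FastWordpieceTokenizer(vocab, token_out_type=tf.string)
    print(tokenizer.tokenize(["they're the greatest"]))

    # Inputs that are already split into words use the renamed flag, replacing
    # the old end_to_end=False behavior.
    word_tokenizer = tf_text.FastWordpieceTokenizer(
        vocab, token_out_type=tf.string, no_pretokenization=True)
    print(word_tokenizer.tokenize([["they're", "the", "greatest"]]))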

tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc

+20 −17
@@ -169,12 +169,13 @@ class StringVocab : public WordpieceVocab {
 // Builds the FastWordpieceTokenizer model.
 class FastWordpieceBuilder {
  public:
-  // When end_to_end is true, we split the input string by punctuation chars
-  // (in addition to whitespaces) and then tokenize it to wordpieces.
+  // When no_pretokenization is false, we split the input string by punctuation
+  // chars (in addition to whitespaces) and then tokenize it to wordpieces.
   absl::Status BuildModel(const std::vector<std::string>& vocab,
                           int max_bytes_per_token,
                           absl::string_view suffix_indicator,
-                          absl::string_view unk_token, bool end_to_end,
+                          absl::string_view unk_token,
+                          bool no_pretokenization,
                           bool support_detokenization);

   absl::StatusOr<std::string> ExportToFlatBuffer() const;
@@ -267,10 +268,10 @@ class FastWordpieceBuilder {
   uint32_t trie_punct_failure_link_node_ =
       fast_wordpiece_tokenizer_utils::kNullNode;

-  // Whether to build the end-to-end tokenizer that tokenize general texts. It
-  // splits the input on punctuation/whitespace and treat each punctuation as an
-  // independent word.
-  bool end_to_end_;
+  // Whether to build the end-to-end tokenizer that tokenizes general texts.
+  // When set to false, it splits the input on punctuation/whitespace and treat
+  // each punctuation as an independent word.
+  bool no_pretokenization_;

   // Whether the tokenizer supports the detokenization function.
   bool support_detokenization_;
@@ -294,11 +295,11 @@ class FastWordpieceBuilder {
 absl::Status FastWordpieceBuilder::BuildModel(
     const std::vector<std::string>& vocab, int max_bytes_per_token,
     absl::string_view suffix_indicator, absl::string_view unk_token,
-    bool end_to_end, bool support_detokenization) {
+    bool no_pretokenization, bool support_detokenization) {
   unk_token_ = std::string(unk_token);
   suffix_indicator_ = std::string(suffix_indicator);
   max_bytes_per_token_ = max_bytes_per_token;
-  end_to_end_ = end_to_end;
+  no_pretokenization_ = no_pretokenization;
   support_detokenization_ = support_detokenization;

   vocab_.emplace(vocab);
@@ -397,7 +398,7 @@ FastWordpieceBuilder::PrepareVocabTokensToBuildTrie() {
     }
     // Skip word that contains punctuation but is not a punctuation itself.
     // <unk>, <pad>, ##. are skipped in this step.
-    if (end_to_end_ && vocab_token.ContainsPunctuation() &&
+    if (!no_pretokenization_ && vocab_token.ContainsPunctuation() &&
        (vocab_token.TokenUnicodeLengthWithoutSuffixIndicator() > 1 ||
         vocab_token.IsSuffixToken())) {
      continue;
@@ -425,7 +426,7 @@ FastWordpieceBuilder::PrepareVocabTokensToBuildTrie() {
     }
   }

-  if (end_to_end_) {
+  if (!no_pretokenization_) {
     // Special treatment for all Unicode punctuation chars that are not already
     // in the trie.
     // The maximum codepoint in Unicode is 0x0010FFFF.
@@ -490,7 +491,7 @@ absl::Status FastWordpieceBuilder::ConstructTrie(
   }
   trie_suffix_root_ = node.node_id;

-  if (end_to_end_) {
+  if (!no_pretokenization_) {
     // Locate the dummy node for the failure link for punctuation nodes.
     node = trie_->CreateTraversalCursorPointToRoot();
     if (!trie_->TryTraverseSeveralSteps(node,
@@ -616,7 +617,8 @@ absl::Status FastWordpieceBuilder::BuildFailureStructure(
           "Failed to find if an end node in the trie is a punctuation char "
           "in node_id_is_punc_map_. It should never happen.");
     }
-    if (end_to_end_ && node_id_is_punc_map_.at(child_node.node_id)) {
+    if (!no_pretokenization_ &&
+        node_id_is_punc_map_.at(child_node.node_id)) {
       // For end-to-end tokenizer, we set the failure link node of every
       // punctuation char as a special node trie_punct_failure_link_node_
       // which is a dummy node (no parent, no descendants, failure link is
@@ -718,7 +720,7 @@ absl::Status FastWordpieceBuilder::BuildFailureStructure(
     }
   }

-  if (end_to_end_ && !suffix_indicator_.empty()) {
+  if (!no_pretokenization_ && !suffix_indicator_.empty()) {
     // Rewire trie links along suffix_indicator_.
     // If the suffix indicator contains a punctuation char, let `u`--(`c`)-->`v`
     // be the first trie edge along the suffix indicator such that the edge
@@ -937,7 +939,7 @@ absl::StatusOr<std::string> FastWordpieceBuilder::ExportToFlatBuffer() const {
   wtcb.add_unk_token_id(unk_token_id_);
   wtcb.add_precomputed_result_for_suffix_indicator(
       precomputed_result_for_suffix_indicator);
-  wtcb.add_end_to_end(end_to_end_);
+  wtcb.add_end_to_end(!no_pretokenization_);
   wtcb.add_support_detokenization(support_detokenization_);
   wtcb.add_vocab_array(vocab_array);
   wtcb.add_vocab_is_suffix_array(vocab_is_suffix_array);
@@ -950,10 +952,11 @@ absl::StatusOr<std::string> FastWordpieceBuilder::ExportToFlatBuffer() const {
 absl::StatusOr<std::string> BuildModelAndExportToFlatBuffer(
     const std::vector<std::string>& vocab, int max_bytes_per_token,
     absl::string_view suffix_indicator, absl::string_view unk_token,
-    bool end_to_end, bool support_detokenization) {
+    bool no_pretokenization, bool support_detokenization) {
   FastWordpieceBuilder builder;
   SH_RETURN_IF_ERROR(builder.BuildModel(vocab, max_bytes_per_token,
-                                        suffix_indicator, unk_token, end_to_end,
+                                        suffix_indicator, unk_token,
+                                        no_pretokenization,
                                         support_detokenization));
   SH_ASSIGN_OR_RETURN(std::string flatbuffer, builder.ExportToFlatBuffer());
   return flatbuffer;

tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h

+3 −3
@@ -32,8 +32,8 @@ namespace text {
 // * suffix_indicator: Characters prepended to a wordpiece to indicate that
 //   it is a suffix to another subword, such as "##".
 // * unk_token: The unknown token string.
-// * end_to_end: Whether to build end-to-end tokneizer.
-//   Set to `true` when the model is used for general text end-to-end
+// * no_pretokenization: Whether to pretokenize on punctuation & whitespace.
+//   Set to `false` when the model is used for general text end-to-end
 //   tokenization, which combines pre-tokenization (splitting text into words
 //   on punctuation/whitespaces) and WordPiece (breaking words into subwords)
 //   into one pass.
@@ -46,7 +46,7 @@ namespace text {
 absl::StatusOr<std::string> BuildModelAndExportToFlatBuffer(
     const std::vector<std::string>& vocab, int max_bytes_per_token,
     absl::string_view suffix_indicator, absl::string_view unk_token,
-    bool end_to_end = false, bool support_detokenization = false);
+    bool no_pretokenization = false, bool support_detokenization = false);
 }  // namespace text
 }  // namespace tensorflow

tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc

+12 −11
@@ -1143,7 +1143,8 @@ TEST_P(TestTokenizeSingleWord, Test) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token));
+                                      spec.suffix_indicator, spec.unk_token,
+                                      /*no_pretokenization=*/true));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -1164,7 +1165,8 @@ TEST_P(TestTokenizeSingleWord, TestNoOutputPieces) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token));
+                                      spec.suffix_indicator, spec.unk_token,
+                                      true /* no_pretokenization */));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -1183,7 +1185,8 @@ TEST_P(TestTokenizeSingleWord, TestNoOutputPiecesOnlyOutputIds) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token));
+                                      spec.suffix_indicator, spec.unk_token,
+                                      true /* no_pretokenization */));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -1198,7 +1201,8 @@ TEST_P(TestTokenizeSingleWord, TestNoOutputPiecesWithPositiveSentenceOffsets) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token));
+                                      spec.suffix_indicator, spec.unk_token,
+                                      true /* no_pretokenization */));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -2365,8 +2369,7 @@ TEST_P(TestTokenizeText, Test) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token,
-                                      /*end_to_end=*/true));
+                                      spec.suffix_indicator, spec.unk_token));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -2387,8 +2390,7 @@ TEST_P(TestTokenizeText, TestNoOutputPieces) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token,
-                                      /*end_to_end=*/true));
+                                      spec.suffix_indicator, spec.unk_token));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -2407,8 +2409,7 @@ TEST_P(TestTokenizeText, TestNoOutputPiecesOnlyOutputIds) {
   ASSERT_OK_AND_ASSIGN(
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
-                                      spec.suffix_indicator, spec.unk_token,
-                                      /*end_to_end=*/true));
+                                      spec.suffix_indicator, spec.unk_token));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

@@ -2466,7 +2467,7 @@ TEST_P(TestTokenizeDetokenize, Test) {
       std::string flatbuffer,
       BuildModelAndExportToFlatBuffer(spec.vocab, spec.max_bytes_per_token,
                                       spec.suffix_indicator, spec.unk_token,
-                                      /*end_to_end=*/false,
+                                      /*no_pretokenization=*/true,
                                       /*support_detokenization=*/true));
   ASSERT_OK_AND_ASSIGN(auto tokenizer,
                        FastWordpieceTokenizer::Create(flatbuffer.data()));

tensorflow_text/core/pybinds/pywrap_fast_wordpiece_tokenizer_model_builder.cc

+2 −2
@@ -27,10 +27,10 @@ PYBIND11_MODULE(pywrap_fast_wordpiece_tokenizer_model_builder, m) {
   m.def("build_fast_wordpiece_model",
         [](const std::vector<std::string>& vocab, int max_bytes_per_token,
            const std::string& suffix_indicator, const std::string& unk_token,
-           bool end_to_end, bool support_detokenization) {
+           bool no_pretokenization, bool support_detokenization) {
          const auto result = BuildModelAndExportToFlatBuffer(
              vocab, max_bytes_per_token, suffix_indicator, unk_token,
-             end_to_end, support_detokenization);
+             no_pretokenization, support_detokenization);
          if (!result.status().ok()) {
            // Propagate the error to the Python code.
            throw std::runtime_error(std::string(result.status().message()));

tensorflow_text/core/pybinds/pywrap_fast_wordpiece_tokenizer_model_builder_test.py

+2 −2
@@ -41,7 +41,7 @@ def test_build(self):
     self.assertEqual(
         pywrap_fast_wordpiece_tokenizer_model_builder
         .build_fast_wordpiece_model(
-            vocab, max_bytes_per_token, suffix_indicator, unk_token, False,
+            vocab, max_bytes_per_token, suffix_indicator, unk_token, True,
             False),
         expected_model_buffer)

@@ -56,7 +56,7 @@ def test_build_throw_exception_unk_token_not_in_vocab(self):
         "Cannot find unk_token in the vocab!"):
       (pywrap_fast_wordpiece_tokenizer_model_builder
        .build_fast_wordpiece_model(
-           vocab, max_bytes_per_token, suffix_indicator, unk_token, False,
+           vocab, max_bytes_per_token, suffix_indicator, unk_token, True,
            False))
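
The flipped boolean in these tests is the fifth positional argument of build_fast_wordpiece_model, which now carries no_pretokenization instead of end_to_end. Since the builder writes the flatbuffer's end_to_end field as the negation of no_pretokenization, passing True here produces the same flatbuffer that end_to_end=False did, so expected_model_buffer is unchanged. A rough annotation of the call exercised above (the pybind wrapper in this commit takes the arguments positionally):

    model_buffer = (pywrap_fast_wordpiece_tokenizer_model_builder
                    .build_fast_wordpiece_model(
                        vocab, max_bytes_per_token, suffix_indicator, unk_token,
                        True,    # no_pretokenization (was end_to_end=False)
                        False))  # support_detokenization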

tensorflow_text/python/ops/fast_wordpiece_tokenizer.py

+15 −14
@@ -67,15 +67,15 @@ def __init__(self,
                max_bytes_per_word=100,
                token_out_type=dtypes.int64,
                unknown_token='[UNK]',
-               end_to_end=False,
+               no_pretokenization=False,
                support_detokenization=False,
                model_buffer=None):
     """Initializes the FastWordpieceTokenizer.

     Two ways to initialize:
       * (preferred) use a precompiled `model_buffer`.
       * use `vocab`, `suffix_indicator`, `max_bytes_per_word`, `unknown_token`,
-        and `end_to_end`.
+        and `no_pretokenization`.

     Args:
       vocab: (optional) The list of tokens in the vocabulary.
@@ -86,9 +86,9 @@ def __init__(self,
         `tf.int64` or `tf.int32` IDs, or `tf.string` subwords.
       unknown_token: (optional) The string value to substitute for an unknown
         token. It must be included in `vocab`.
-      end_to_end: (optional) Whether to use end-to-end Fast WordPiece tokenizer.
-        When true, the input must be a sentence and we split the input on
-        whitespaces and punctuations.
+      no_pretokenization: (optional) By default, the input is split on
+        whitespaces and punctuations before applying the Wordpiece tokenization.
+        When true, the input is assumed to be pretokenized already.
       support_detokenization: (optional) Whether to make the tokenizer support
         doing detokenization. Setting it to true expands the size of the model
         flatbuffer. As a reference, when using 120k multilingual BERT WordPiece
@@ -105,7 +105,8 @@ def __init__(self,
       model_buffer = (pywrap_fast_wordpiece_tokenizer_model_builder
                       .build_fast_wordpiece_model(
                           vocab, max_bytes_per_word, suffix_indicator,
-                          unknown_token, end_to_end, support_detokenization))
+                          unknown_token, no_pretokenization,
+                          support_detokenization))
     # Use uint8 tensor as a buffer for the model to avoid any possible changes,
     # for example truncation by '\0'.
     self._model = constant_op.constant(list(model_buffer), dtype=dtypes.uint8)
@@ -117,18 +118,18 @@ def tokenize(self, input): # pylint: disable=redefined-builtin

     ### Example 1, single word tokenization:
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
-    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string,
+    ...                                    no_pretokenization=True)
     >>> tokens = [["they're", "the", "greatest"]]
     >>> tokenizer.tokenize(tokens)
     <tf.RaggedTensor [[[b'they', b"##'", b'##re'], [b'the'],
                        [b'great', b'##est']]]>

-    ### Example 2, general text end-to-end tokenization (pre-tokenization on
+    ### Example 2, general text tokenization (pre-tokenization on
     ### punctuation and whitespace followed by WordPiece tokenization):
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]",
     ...          "'", "re"]
-    >>> tokenizer = FastWordpieceTokenizer(
-    ...     vocab, token_out_type=tf.string, end_to_end=True)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
     >>> tokens = [["they're the greatest", "the greatest"]]
     >>> tokenizer.tokenize(tokens)
     <tf.RaggedTensor [[[b'they', b"'", b're', b'the', b'great', b'##est'],
@@ -154,7 +155,8 @@ def tokenize_with_offsets(self, input): # pylint: disable=redefined-builtin

     ### Example 1, single word tokenization:
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
-    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string,
+    ...                                    no_pretokenization=True)
     >>> tokens = [["they're", "the", "greatest"]]
     >>> subtokens, starts, ends = tokenizer.tokenize_with_offsets(tokens)
     >>> subtokens
@@ -165,12 +167,11 @@ def tokenize_with_offsets(self, input): # pylint: disable=redefined-builtin
     >>> ends
     <tf.RaggedTensor [[[4, 5, 7], [3], [5, 8]]]>

-    ### Example 2, general text end-to-end tokenization (pre-tokenization on
+    ### Example 2, general text tokenization (pre-tokenization on
     ### punctuation and whitespace followed by WordPiece tokenization):
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]",
     ...          "'", "re"]
-    >>> tokenizer = FastWordpieceTokenizer(
-    ...     vocab, token_out_type=tf.string, end_to_end=True)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
     >>> tokens = [["they're the greatest", "the greatest"]]
     >>> subtokens, starts, ends = tokenizer.tokenize_with_offsets(tokens)
     >>> subtokens
