@@ -67,15 +67,15 @@ def __init__(self,
                max_bytes_per_word=100,
                token_out_type=dtypes.int64,
                unknown_token='[UNK]',
-               end_to_end=False,
+               no_pretokenization=False,
                support_detokenization=False,
                model_buffer=None):
     """Initializes the FastWordpieceTokenizer.

     Two ways to initialize:
       * (preferred) use a precompiled `model_buffer`.
       * use `vocab`, `suffix_indicator`, `max_bytes_per_word`, `unknown_token`,
-        and `end_to_end`.
+        and `no_pretokenization`.

     Args:
       vocab: (optional) The list of tokens in the vocabulary.
@@ -86,9 +86,9 @@ def __init__(self,
         `tf.int64` or `tf.int32` IDs, or `tf.string` subwords.
       unknown_token: (optional) The string value to substitute for an unknown
         token. It must be included in `vocab`.
-      end_to_end: (optional) Whether to use end-to-end Fast WordPiece tokenizer.
-        When true, the input must be a sentence and we split the input on
-        whitespaces and punctuations.
+      no_pretokenization: (optional) By default, the input is split on
+        whitespace and punctuation before applying WordPiece tokenization.
+        When true, the input is assumed to be pretokenized already.
       support_detokenization: (optional) Whether to make the tokenizer support
         doing detokenization. Setting it to true expands the size of the model
         flatbuffer. As a reference, when using 120k multilingual BERT WordPiece
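Below is a minimal usage sketch of the renamed flag (not part of this diff). It assumes `tensorflow_text` is importable as `tf_text` and exports the class as `tf_text.FastWordpieceTokenizer`; the vocab and expected outputs mirror the docstring examples further down.

```python
import tensorflow as tf
import tensorflow_text as tf_text  # assumed public export of FastWordpieceTokenizer

vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]", "'", "re"]

# Default behavior: raw text is first split on whitespace and punctuation,
# then each resulting word is WordPiece-tokenized.
sentence_tokenizer = tf_text.FastWordpieceTokenizer(vocab, token_out_type=tf.string)
print(sentence_tokenizer.tokenize(["they're the greatest"]))
# expected: [[b'they', b"'", b're', b'the', b'great', b'##est']]

# no_pretokenization=True: each input string is treated as a single,
# already pre-tokenized word (the old end_to_end=False behavior).
word_tokenizer = tf_text.FastWordpieceTokenizer(
    vocab, token_out_type=tf.string, no_pretokenization=True)
print(word_tokenizer.tokenize(["they're", "greatest"]))
# expected: [[b'they', b"##'", b'##re'], [b'great', b'##est']]
```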
@@ -105,7 +105,8 @@ def __init__(self,
       model_buffer = (pywrap_fast_wordpiece_tokenizer_model_builder
                       .build_fast_wordpiece_model(
                           vocab, max_bytes_per_word, suffix_indicator,
-                          unknown_token, end_to_end, support_detokenization))
+                          unknown_token, no_pretokenization,
+                          support_detokenization))
     # Use uint8 tensor as a buffer for the model to avoid any possible changes,
     # for example truncation by '\0'.
     self._model = constant_op.constant(list(model_buffer), dtype=dtypes.uint8)
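For the "(preferred) precompiled `model_buffer`" path mentioned above, here is a hedged sketch. The builder call and its argument order come from the hunk above, but the import path of the internal pybind module is an assumption and may differ between versions; treat this as illustrative rather than a supported API.

```python
import tensorflow as tf
import tensorflow_text as tf_text
# Assumed location of the internal builder referenced in the diff; this is a
# private module and its path is not guaranteed to be stable.
from tensorflow_text.core.pybinds import pywrap_fast_wordpiece_tokenizer_model_builder

vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]

# Argument order follows the updated call above:
# (vocab, max_bytes_per_word, suffix_indicator, unknown_token,
#  no_pretokenization, support_detokenization).
model_buffer = (
    pywrap_fast_wordpiece_tokenizer_model_builder.build_fast_wordpiece_model(
        vocab, 100, "##", "[UNK]", False, False))

# Reusing the buffer avoids rebuilding the model each time a tokenizer is
# constructed, e.g. across replicas or repeated graph builds.
tokenizer = tf_text.FastWordpieceTokenizer(
    model_buffer=model_buffer, token_out_type=tf.string)
print(tokenizer.tokenize(["they're the greatest"]))
```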
@@ -117,18 +118,18 @@ def tokenize(self, input):  # pylint: disable=redefined-builtin

     ### Example 1, single word tokenization:
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
-    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string,
+    ...                                    no_pretokenization=True)
     >>> tokens = [["they're", "the", "greatest"]]
     >>> tokenizer.tokenize(tokens)
     <tf.RaggedTensor [[[b'they', b"##'", b'##re'], [b'the'],
                        [b'great', b'##est']]]>

-    ### Example 2, general text end-to-end tokenization (pre-tokenization on
+    ### Example 2, general text tokenization (pre-tokenization on
     ### punctuation and whitespace followed by WordPiece tokenization):
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]",
     ...          "'", "re"]
-    >>> tokenizer = FastWordpieceTokenizer(
-    ...     vocab, token_out_type=tf.string, end_to_end=True)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
     >>> tokens = [["they're the greatest", "the greatest"]]
     >>> tokenizer.tokenize(tokens)
     <tf.RaggedTensor [[[b'they', b"'", b're', b'the', b'great', b'##est'],
@@ -154,7 +155,8 @@ def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin

     ### Example 1, single word tokenization:
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
-    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string,
+    ...                                    no_pretokenization=True)
     >>> tokens = [["they're", "the", "greatest"]]
     >>> subtokens, starts, ends = tokenizer.tokenize_with_offsets(tokens)
     >>> subtokens
@@ -165,12 +167,11 @@ def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
     >>> ends
     <tf.RaggedTensor [[[4, 5, 7], [3], [5, 8]]]>

-    ### Example 2, general text end-to-end tokenization (pre-tokenization on
+    ### Example 2, general text tokenization (pre-tokenization on
     ### punctuation and whitespace followed by WordPiece tokenization):
     >>> vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]",
     ...          "'", "re"]
-    >>> tokenizer = FastWordpieceTokenizer(
-    ...     vocab, token_out_type=tf.string, end_to_end=True)
+    >>> tokenizer = FastWordpieceTokenizer(vocab, token_out_type=tf.string)
     >>> tokens = [["they're the greatest", "the greatest"]]
     >>> subtokens, starts, ends = tokenizer.tokenize_with_offsets(tokens)
     >>> subtokens
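As a follow-up illustration (not part of this diff): the `starts`/`ends` returned by `tokenize_with_offsets` are byte offsets into each input string, so they can be used to slice the original text back out. The sketch below assumes the same `tf_text.FastWordpieceTokenizer` export and reuses the vocab from Example 1.

```python
import tensorflow as tf
import tensorflow_text as tf_text

vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
tokenizer = tf_text.FastWordpieceTokenizer(
    vocab, token_out_type=tf.string, no_pretokenization=True)

word = "greatest"
subtokens, starts, ends = tokenizer.tokenize_with_offsets([word])
for start, end in zip(starts[0].numpy(), ends[0].numpy()):
    # Prints "great" then "est"; the second subtoken itself is b'##est', but
    # the offsets point at the raw characters of the input string.
    print(word[start:end])
```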