Skip to content

Commit

Permalink
Silence warning about cudf's subword tokenizer behaving differently than huggingface
Browse files Browse the repository at this point in the history
  • Loading branch information
dagardner-nv committed Jan 9, 2024
1 parent 5462726 commit 874776f
Showing 1 changed file with 13 additions and 7 deletions.
20 changes: 13 additions & 7 deletions morpheus/stages/preprocess/preprocess_nlp_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import typing
import warnings
from functools import partial

import mrc
Expand Down Expand Up @@ -148,13 +149,18 @@ def pre_process_batch(x: MultiMessage,
"""
text_ser = cudf.Series(x.get_meta(column))

tokenized = tokenize_text_series(vocab_hash_file=vocab_hash_file,
do_lower_case=do_lower_case,
text_ser=text_ser,
seq_len=seq_len,
stride=stride,
truncation=truncation,
add_special_tokens=add_special_tokens)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="When truncation is not True, the behavior currently differs from HuggingFace.*",
category=UserWarning)
tokenized = tokenize_text_series(vocab_hash_file=vocab_hash_file,
do_lower_case=do_lower_case,
text_ser=text_ser,
seq_len=seq_len,
stride=stride,
truncation=truncation,
add_special_tokens=add_special_tokens)
del text_ser

seg_ids = tokenized.segment_ids
Expand Down

0 comments on commit 874776f

Please sign in to comment.