Silence warning about cudf's subword tokenizer behaving differently than huggingface
nv-morpheus · Jan 9, 2024 · 874776f · 874776f
1 parent 5462726
commit 874776f
Showing 1 changed file with 13 additions and 7 deletions.
diff --git a/morpheus/stages/preprocess/preprocess_nlp_stage.py b/morpheus/stages/preprocess/preprocess_nlp_stage.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import typing
+import warnings
 from functools import partial
 
 import mrc
@@ -148,13 +149,18 @@ def pre_process_batch(x: MultiMessage,
         """
         text_ser = cudf.Series(x.get_meta(column))
 
-        tokenized = tokenize_text_series(vocab_hash_file=vocab_hash_file,
-                                         do_lower_case=do_lower_case,
-                                         text_ser=text_ser,
-                                         seq_len=seq_len,
-                                         stride=stride,
-                                         truncation=truncation,
-                                         add_special_tokens=add_special_tokens)
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="When truncation is not True, the behavior currently differs from HuggingFace.*",
+                category=UserWarning)
+            tokenized = tokenize_text_series(vocab_hash_file=vocab_hash_file,
+                                             do_lower_case=do_lower_case,
+                                             text_ser=text_ser,
+                                             seq_len=seq_len,
+                                             stride=stride,
+                                             truncation=truncation,
+                                             add_special_tokens=add_special_tokens)
         del text_ser
 
         seg_ids = tokenized.segment_ids