diff --git a/crossfit/op/tokenize.py b/crossfit/op/tokenize.py index 6a64855..901f754 100644 --- a/crossfit/op/tokenize.py +++ b/crossfit/op/tokenize.py @@ -17,6 +17,7 @@ import cudf import cupy as cp +import pandas as pd import torch from cudf.core.subword_tokenizer import SubwordTokenizer, _cast_to_appropriate_type from cudf.utils.hash_vocab_utils import hash_vocab @@ -63,6 +64,8 @@ def tokenize_strings(self, sentences, max_length=None): if isinstance(sentences, cudf.Series): sentences = sentences.to_arrow().to_pylist() + elif isinstance(sentences, pd.Series): + sentences = sentences.to_list() with torch.no_grad(): tokenized_data = tokenizer.batch_encode_plus( diff --git a/tests/op/test_tokenize.py b/tests/op/test_tokenize.py index 6123208..8c93bca 100644 --- a/tests/op/test_tokenize.py +++ b/tests/op/test_tokenize.py @@ -18,6 +18,8 @@ cp = pytest.importorskip("cupy") cudf = pytest.importorskip("cudf") dask_cudf = pytest.importorskip("dask_cudf") +dd = pytest.importorskip("dask.dataframe") +pd = pytest.importorskip("pandas") transformers = pytest.importorskip("transformers") torch = pytest.importorskip("torch") @@ -144,3 +146,12 @@ def test_clip_tokens_no_clipping_needed(): assert result["attention_mask"].shape == (2, 3) assert torch.equal(result["input_ids"].to("cpu"), torch.tensor([[1, 2, 3], [4, 5, 6]])) assert torch.equal(result["attention_mask"].to("cpu"), torch.tensor([[1, 1, 1], [1, 1, 1]])) + + +def test_tokenize_strings_cpu(model_name="microsoft/deberta-v3-base"): + model = cf.HFModel(model_name) + tokenizer = op.Tokenizer(model, cols=["text"], tokenizer_type="spm") + input_strings = ["hello world", "this is a sentence"] + ddf = dd.from_pandas(pd.DataFrame({"text": input_strings}), npartitions=1) + results = tokenizer(ddf) + results = results.compute()