Skip to content

Commit

Permalink
Merge in main
Browse files Browse the repository at this point in the history
  • Loading branch information
VibhuJawa committed Aug 27, 2024
2 parents 7e20f5e + 9e52a90 commit 3ff73f3
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 1 deletion.
1 change: 0 additions & 1 deletion crossfit/backend/torch/op/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def __init__(
def call(self, data, partition_info=None):
# Get the current CUDA device
current_device = torch.cuda.current_device()

# Print CUDA memory at the beginning of the method
print(f"CUDA memory at start (device {current_device}):")
print(torch.cuda.memory_summary(device=current_device))
Expand Down
3 changes: 3 additions & 0 deletions crossfit/op/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import cudf
import cupy as cp
import pandas as pd
import torch
from cudf.core.subword_tokenizer import SubwordTokenizer, _cast_to_appropriate_type
from cudf.utils.hash_vocab_utils import hash_vocab
Expand Down Expand Up @@ -63,6 +64,8 @@ def tokenize_strings(self, sentences, max_length=None):

if isinstance(sentences, cudf.Series):
sentences = sentences.to_arrow().to_pylist()
elif isinstance(sentences, pd.Series):
sentences = sentences.to_list()

with torch.no_grad():
tokenized_data = tokenizer.batch_encode_plus(
Expand Down
11 changes: 11 additions & 0 deletions tests/op/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
cp = pytest.importorskip("cupy")
cudf = pytest.importorskip("cudf")
dask_cudf = pytest.importorskip("dask_cudf")
dd = pytest.importorskip("dask.dataframe")
pd = pytest.importorskip("pandas")
transformers = pytest.importorskip("transformers")
torch = pytest.importorskip("torch")

Expand Down Expand Up @@ -144,3 +146,12 @@ def test_clip_tokens_no_clipping_needed():
assert result["attention_mask"].shape == (2, 3)
assert torch.equal(result["input_ids"].to("cpu"), torch.tensor([[1, 2, 3], [4, 5, 6]]))
assert torch.equal(result["attention_mask"].to("cpu"), torch.tensor([[1, 1, 1], [1, 1, 1]]))


def test_tokenize_strings_cpu(model_name="microsoft/deberta-v3-base"):
    """Smoke-test the tokenizer op on a pandas-backed dask DataFrame.

    Builds a ``Tokenizer`` with ``tokenizer_type="spm"`` for ``model_name``
    and applies it to a single-partition dask DataFrame created from pandas.
    No assertions are made on the output: the test passes as long as the
    tokenizer call and the final ``compute()`` finish without raising.
    """
    hf_model = cf.HFModel(model_name)
    spm_tokenizer = op.Tokenizer(hf_model, cols=["text"], tokenizer_type="spm")

    # Two short sentences held in a pandas (not cudf) frame, one partition.
    texts = ["hello world", "this is a sentence"]
    frame = pd.DataFrame({"text": texts})
    cpu_ddf = dd.from_pandas(frame, npartitions=1)

    # Materialize the lazy result so the tokenization actually executes.
    spm_tokenizer(cpu_ddf).compute()

0 comments on commit 3ff73f3

Please sign in to comment.