From e885474c98d3d1bb95967ec8bcc76d047655d496 Mon Sep 17 00:00:00 2001
From: Umair Ahmed <uahmed@nvidia.com>
Date: Mon, 23 Sep 2024 22:54:28 +0530
Subject: [PATCH] Added test_model_function.py to test get_model_output from
 Model class. Handle some review comments.

Signed-off-by: Ahmed Umair <ahmedumair.1993@gmail.com>
---
 crossfit/backend/torch/model.py |  9 ++---
 examples/custom_ct2_model.py    | 17 ++++++++-
 tests/op/test_model_function.py | 67 +++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 6 deletions(-)
 create mode 100644 tests/op/test_model_function.py

diff --git a/crossfit/backend/torch/model.py b/crossfit/backend/torch/model.py
index b783123..5d39b1a 100644
--- a/crossfit/backend/torch/model.py
+++ b/crossfit/backend/torch/model.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+
 import cudf
 import cupy as cp
 from crossfit.backend.cudf.series import (
@@ -24,7 +26,7 @@ class Model:
     def __init__(self, path_or_name: str, max_mem_gb: int = 16, model_output_type: str = "numeric"):
         self.path_or_name = path_or_name
         self.max_mem_gb = max_mem_gb
-        if model_output_type == "numeric" or model_output_type == "string":
+        if model_output_type in ["numeric", "string"]:
             self.model_output_type = model_output_type
         else:
             raise ValueError(
@@ -66,10 +68,7 @@ def get_model_output(self, all_outputs_ls, index, loader, pred_output_col) -> cu
         )
 
         if self.model_output_type == "string":
-            all_outputs = []
-            for output in all_outputs_ls:
-                for o in output:
-                    all_outputs.append(o)
+            all_outputs = [o for output in all_outputs_ls for o in output]
             out[pred_output_col] = cudf.Series(data=all_outputs, index=_index)
             del all_outputs_ls
             del loader
diff --git a/examples/custom_ct2_model.py b/examples/custom_ct2_model.py
index 2f9899b..2e867ce 100644
--- a/examples/custom_ct2_model.py
+++ b/examples/custom_ct2_model.py
@@ -1,3 +1,18 @@
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 import argparse
 from dataclasses import dataclass
 from functools import lru_cache
@@ -141,7 +156,7 @@ def main():
         model = ModelForSeq2SeqModel(Config)
         pipe = op.Sequential(
             op.Tokenizer(
-                model, cols=[args.input_column], tokenizer_type="sentencepiece", max_length=255
+                model, cols=[args.input_column], tokenizer_type="default", max_length=255
             ),
             op.Predictor(model, sorted_data_loader=True, batch_size=args.batch_size),
             repartition=args.partitions,
diff --git a/tests/op/test_model_function.py b/tests/op/test_model_function.py
new file mode 100644
index 0000000..ce9aa50
--- /dev/null
+++ b/tests/op/test_model_function.py
@@ -0,0 +1,67 @@
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from unittest.mock import patch
+
+cp = pytest.importorskip("cupy")
+cudf = pytest.importorskip("cudf")
+dask_cudf = pytest.importorskip("dask_cudf")
+dd = pytest.importorskip("dask.dataframe")
+pd = pytest.importorskip("pandas")
+transformers = pytest.importorskip("transformers")
+torch = pytest.importorskip("torch")
+
+import crossfit as cf  # noqa: E402
+
+
+cf_loader = pytest.importorskip("crossfit.backend.torch.loader")
+
+
+@pytest.mark.parametrize("trust_remote_code", ["y"])
+def test_model_output_int(trust_remote_code, model_name="ai4bharat/indictrans2-en-indic-1B"):
+    with patch("builtins.input", return_value=trust_remote_code):
+        tokens_data = cudf.DataFrame({"input_ids": [[11, 12, 13], [14, 15, 16], [17, 18, 19]]})
+        index = tokens_data.index.copy()
+        model = cf.HFModel(model_name)
+        data = [[4], [7], [10]]
+        all_outputs_ls = torch.tensor(data)
+        loader = cf_loader.SortedSeqLoader(
+            tokens_data,
+            model,
+        )
+        pred_output_col = "translation"
+        out = model.get_model_output(all_outputs_ls, index, loader, pred_output_col)
+        assert isinstance(out, cudf.DataFrame)
+        assert isinstance(out["translation"][0][0], int)
+
+
+@pytest.mark.parametrize("trust_remote_code", ["y"])
+def test_model_output_str(trust_remote_code, model_name="ai4bharat/indictrans2-en-indic-1B"):
+    with patch("builtins.input", return_value=trust_remote_code):
+        tokens_data = cudf.DataFrame(
+            {"input_ids": [[18264, 7728, 8], [123, 99, 2258], [3115, 125, 123]]}
+        )
+        index = tokens_data.index.copy()
+        model = cf.HFModel(model_name, model_output_type="string")
+        data = [["▁हमारे▁परीक्षण▁डेटा"], ["▁पर▁हमारे▁दो"], ["▁दूरी▁कार्यों▁की"]]
+        all_outputs_ls = data
+        loader = cf_loader.SortedSeqLoader(
+            tokens_data,
+            model,
+        )
+        pred_output_col = "translation"
+        out = model.get_model_output(all_outputs_ls, index, loader, pred_output_col)
+        assert isinstance(out, cudf.DataFrame)
+        assert isinstance(out["translation"][0][0], str)