Skip to content

Commit

Permalink
Merge branch 'newest_mlora' of https://github.com/cainiaogoroad/mLoRA
Browse files Browse the repository at this point in the history
…into newest_mlora
  • Loading branch information
cainiaogoroad committed Jun 27, 2024
2 parents 07e2d60 + a74889d commit 3aca4fe
Show file tree
Hide file tree
Showing 4 changed files with 5 additions and 4 deletions.
2 changes: 1 addition & 1 deletion mlora/executor/task/cpo_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def data(self, start_idx: int) -> Tuple[List[Tokens], List[MLoRADataConfig]]:

# convert the string to tokens
ret_tokens = list(map(lambda raw_str: self.tokenizer_.encode(
raw_str, bos=True, eos=True), batch_str))
raw_str, bos=True, eos=True, cutoff_len=self.config_.cutoff_len_), batch_str))
end_idx = start_idx + len(ret_tokens)

def loss_fn(input: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
Expand Down
2 changes: 1 addition & 1 deletion mlora/executor/task/dpo_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def data(self, start_idx: int) -> Tuple[List[Tokens], List[MLoRADataConfig]]:
ret_tokens = []
# for reference
ref_model_token = list(map(lambda raw_str: self.tokenizer_.encode(
raw_str, bos=True, eos=True), batch_str))
raw_str, bos=True, eos=True, cutoff_len=self.config_.cutoff_len_), batch_str))
policy_model_token = copy.deepcopy(ref_model_token)

ret_tokens.extend(ref_model_token)
Expand Down
2 changes: 1 addition & 1 deletion mlora/executor/task/train_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def data(self, start_idx: int) -> Tuple[List[Tokens], List[MLoRADataConfig]]:

# convert the string to tokens
ret_tokens = list(map(lambda raw_str: self.tokenizer_.encode(
raw_str, bos=True, eos=True), batch_str))
raw_str, bos=True, eos=True, cutoff_len=self.config_.cutoff_len_), batch_str))
end_idx = start_idx + len(ret_tokens)

def loss_fn(input: torch.Tensor, target: torch.Tensor, _: torch.Tensor) -> torch.Tensor:
Expand Down
3 changes: 2 additions & 1 deletion mlora/model/tokenizer/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ def __init__(self, model_path: str):
if self.pad_id_ is None and self.unk_id_ is not None:
self.pad_id_ = self.unk_id_

def encode(self, data: str, bos=True, eos=True) -> Tokens:
def encode(self, data: str, bos=True, eos=True, cutoff_len=4096) -> Tokens:
tokens = self.tokenizer_.encode(data, add_special_tokens=False)
tokens = tokens[:cutoff_len - int(bos) - int(eos)]
if bos:
tokens = [self.bos_id_] + tokens
if eos:
Expand Down

0 comments on commit 3aca4fe

Please sign in to comment.