6 changes: 3 additions & 3 deletions comps/finetuning/src/Dockerfile
@@ -20,9 +20,9 @@ ENV PATH=$PATH:/home/user/.local/bin

ARG uvpip='uv pip install --system --no-cache-dir'
RUN python -m pip install --no-cache-dir --upgrade pip setuptools uv && \
- python -m $uvpip torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \
- python -m $uvpip intel-extension-for-pytorch && \
- python -m $uvpip oneccl_bind_pt --index-strategy unsafe-best-match --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
+ python -m $uvpip oneccl_bind_pt==2.7.0+cpu --index-strategy unsafe-best-match --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
+ python -m $uvpip torch==2.7.0+cpu torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \
+ python -m $uvpip intel-extension-for-pytorch==2.7.0 && \
python -m $uvpip -r /home/user/comps/finetuning/src/requirements-cpu.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user
36 changes: 36 additions & 0 deletions comps/finetuning/src/README.md
@@ -252,6 +252,42 @@ Below is an example for the format of the DPO dataset:
{"system": "You are an AI assistant. You will be given a task. You must generate a detailed and long answer.", "question": "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "chosen": "Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.", "rejected": " Sure! Here's a sentence that describes all the data you provided:\n\n\"Midsummer House is a moderately priced Chinese restaurant with a customer rating of 3 out of 5, located near All Bar One, offering a variety of delicious dishes.\""}
```

#### 3.2.6 Reasoning model finetuning

Use the following command to launch a job that finetunes a model such as `Qwen/Qwen2.5-7B` for reasoning (the example below uses `Qwen/Qwen2.5-3B`):

```bash
# create a finetuning job
curl http://${your_ip}:8015/v1/fine_tuning/jobs \
-X POST \
-H "Content-Type: application/json" \
-d '{
"training_file": "medical_o1_sft.json",
"model": "Qwen/Qwen2.5-3B",
"Dataset": {
"max_length":2048
},
"Training":{
"epochs":1,
"batch_size":1,
"gradient_accumulation_steps":8
},
"General": {
"task":"reasoning"
}
}'
```
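
The `Dataset`, `Training`, and `General` sections map to the corresponding fields of the finetuning config. By default the reasoning task expects the dataset fields `Question`, `Complex_CoT`, and `Response`; if your data uses different names, they can be overridden via the `reasoning_dataset_keys` option in the `Dataset` section (see `finetune_config.py`). A minimal sketch, using hypothetical file and field names:

```bash
# hypothetical dataset with custom field names ("question", "cot", "answer")
curl http://${your_ip}:8015/v1/fine_tuning/jobs \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{
    "training_file": "my_reasoning_data.json",
    "model": "Qwen/Qwen2.5-3B",
    "Dataset": {
      "max_length": 2048,
      "reasoning_dataset_keys": ["question", "cot", "answer"]
    },
    "General": {
      "task": "reasoning"
    }
  }'
```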

Below is an example for the format of the reasoning dataset [FreedomIntelligence/medical-o1-reasoning-SFT](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT):

```json
{
"Question": "Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?",
"Complex_CoT": "Okay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?\n\nBut wait, there's more. The right lower leg is swollen and tender, which is like waving a big flag for deep vein thrombosis, especially after a long flight or sitting around a lot.\n\nSo, now I'm thinking, how could a clot in the leg end up causing issues like weakness or stroke symptoms?\n\nOh, right! There's this thing called a paradoxical embolism. It can happen if there's some kind of short circuit in the heart - like a hole that shouldn't be there.\n\nLet's put this together: if a blood clot from the leg somehow travels to the left side of the heart, it could shoot off to the brain and cause that sudden weakness by blocking blood flow there.\n\nHmm, but how would the clot get from the right side of the heart to the left without going through the lungs and getting filtered out?\n\nHere's where our cardiac anomaly comes in: a patent foramen ovale or PFO. That's like a sneaky little shortcut in the heart between the right and left atria.\n\nAnd it's actually pretty common, found in about a quarter of adults, which definitely makes it the top suspect here.\n\nSo with all these pieces - long travel, leg clot, sudden weakness - a PFO fits the bill perfectly, letting a clot cross over and cause all this.\n\nEverything fits together pretty neatly, so I'd bet PFO is the heart issue waiting to be discovered. Yeah, that really clicks into place!",
"Response": "The specific cardiac abnormality most likely to be found in this scenario is a patent foramen ovale (PFO). This condition could allow a blood clot from the venous system, such as one from a deep vein thrombosis in the leg, to bypass the lungs and pass directly into the arterial circulation. This can occur when the clot moves from the right atrium to the left atrium through the PFO. Once in the arterial system, the clot can travel to the brain, potentially causing an embolic stroke, which would explain the sudden weakness in the left arm and leg. The connection between the recent travel, which increases the risk of deep vein thrombosis, and the neurological symptoms suggests the presence of a PFO facilitating a paradoxical embolism."
}
```
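
During preprocessing, `ReasoningDataProcessor.make_prompt` turns each record into one training sample: the built-in system prompt, a `### User` line with the `Question`, and a `### Assistant` line form the source, while the target wraps `Complex_CoT` in the newly added `<think>`/`</think>` special tokens and appends the `Response` followed by the EOS token. If only two dataset keys are configured, the chain-of-thought is omitted and the target is just the response.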

### 3.3 Manage fine-tuning job

Below commands show how to list finetuning jobs, retrieve a finetuning job, cancel a finetuning job and list checkpoints of a finetuning job.
5 changes: 4 additions & 1 deletion comps/finetuning/src/integrations/finetune_config.py
@@ -74,7 +74,7 @@ def check_report_to(cls, v: str):

@validator("task")
def check_task(cls, v: str):
- assert v in ["instruction_tuning", "pretraining", "dpo", "rerank", "embedding"]
+ assert v in ["instruction_tuning", "pretraining", "dpo", "rerank", "embedding", "reasoning"]
return v


@@ -116,6 +116,9 @@ class DatasetConfig(BaseModel):
)
query_instruction_for_retrieval: Optional[str] = Field(default=None, description="instruction for query")
passage_instruction_for_retrieval: Optional[str] = Field(default=None, description="instruction for passage")
reasoning_dataset_keys: Optional[List[str]] = Field(
default=["Question", "Complex_CoT", "Response"], description="keys of reasoning dataset"
)


class RayResourceConfig(BaseModel):
@@ -44,8 +44,8 @@ def make_prompt(self, examples):
prompts["prompt_targets"] = []
for rec in examples:
instruction = rec["instruction"]
- response = rec["input"]
- context = rec.get("output")
+ context = rec["input"]
+ response = rec.get("output")
if not instruction:
raise ValueError(f"Expected an instruction in: {rec}")
# if not response:
@@ -193,7 +193,94 @@ def tokenize(self, examples):
if self.mask_input:
labels[:input_id_len] = [IGNORE_INDEX] * input_id_len
# mask response
- if self.mask_response:
+ elif self.mask_response:
labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len)

examples["input_ids"].append(results["input_ids"])
examples["labels"].append(labels)
examples["attention_mask"].append(results["attention_mask"])
return examples


class ReasoningDataProcessor:
def __init__(self, config, tokenizer):
self.tokenizer = tokenizer
self.think_tokens = ["<think>", "</think>"]
tokenizer.add_special_tokens({"additional_special_tokens": self.think_tokens})
tokenizer.pad_token = tokenizer.eos_token
self.system = """
You are an assistant that engages in extremely thorough, self-questioning reasoning. You will help the user to answer the question they propose.
Your responses must be given after the thorough and rigorous reasoning, and output your reasoning content between <think> and </think> tags.
Make sure to always include the final answer, and output the final answer after the </think> tag.
"""
self.keys = config["Dataset"].get("reasoning_dataset_keys", ["Question", "Complex_CoT", "Response"])
assert len(self.keys) >= 2, "dataset must have 2 keys or more."
self.padding_side = config["Dataset"].get("padding_side", "right")
self.truncation_side = config["Dataset"].get("truncation_side", "right")
self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 2048)
self.truncation = config["Dataset"].get("truncation", True)
# set padding to max_length for hpu to avoid bug in GaudiTrainer and accelerate training on hpu
self.padding = config["Dataset"].get("padding", True) if config["Training"]["device"] != "hpu" else "max_length"
self.mask_input = config["Dataset"].get("mask_input", True)
self.mask_response = config["Dataset"].get("mask_response", True)

def make_prompt(self, examples):
prompts = {}
prompts["prompt_sources"] = []
prompts["prompt_targets"] = []
for rec in examples:
for key in self.keys:
assert key in rec, f"Key {key} not in dataset, provide correct keys in reasoning_dataset_keys argument."
question = rec[self.keys[0]]
reasoning = rec[self.keys[1]] if len(self.keys) == 3 else ""
response = rec[self.keys[2]] if len(self.keys) == 3 else rec[self.keys[1]]
if not question:
raise ValueError(f"Expected a question in: {rec}")
prompt = self.system + "\n" + "### User" + "\n" + question + "\n" + "### Assistant" + "\n"
prompts["prompt_sources"].append(prompt)
if reasoning:
prompt_response = (
self.think_tokens[0] + reasoning + self.think_tokens[1] + "\n" + response + self.tokenizer.eos_token
)
else:
prompt_response = response + self.tokenizer.eos_token
prompts["prompt_targets"].append(prompt_response)
return prompts

def tokenize(self, examples):
keys = list(examples.data.keys())
if len(keys) != 2:
raise ValueError("Unsupported dataset format")

examples["input_ids"] = []
examples["labels"] = []
examples["attention_mask"] = []
for s, t in zip(examples[keys[0]], examples[keys[1]]):
results = self.tokenizer(
s + t,
padding=self.padding,
truncation=self.truncation,
return_tensors=None,
max_length=self.max_length,
)

input_ids = results["input_ids"]
input_len = len(input_ids)
labels = copy.deepcopy(input_ids)
if self.mask_input or self.mask_response:
sources_tokenized = self.tokenizer(
s,
padding=False,
truncation=True,
return_tensors=None,
max_length=self.max_length,
)
input_id_len = len(sources_tokenized["input_ids"])
# mask input
if self.mask_input:
labels[:input_id_len] = [IGNORE_INDEX] * input_id_len
# mask response
elif self.mask_response:
labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len)

examples["input_ids"].append(results["input_ids"])
@@ -32,6 +32,7 @@
GroupCollator,
InstructionDataProcessor,
PretrainingDataProcessor,
ReasoningDataProcessor,
TrainDatasetForCE,
TrainDatasetForEmbedding,
)
@@ -301,6 +302,21 @@ def group_texts(examples):
desc="Tokenize dataset",
)
return tokenized_dataset
elif task == "reasoning":
processor = ReasoningDataProcessor(config, tokenizer)
for key in dataset:
prompts = processor.make_prompt(dataset[key])
dataset[key] = datasets.Dataset.from_dict(prompts)

column_names = list(dataset["train"].features)
tokenized_dataset = dataset.map(
processor.tokenize,
remove_columns=column_names,
batched=True,
load_from_cache_file=False,
desc="Tokenize dataset",
)
return tokenized_dataset
elif task == "rerank":
dataset["train"] = TrainDatasetForCE(dataset["train"], config["Dataset"], tokenizer)
return dataset
@@ -313,10 +329,14 @@ def group_texts(examples):

def prepare_data_collator(config: Dict, tokenizer):
task = config["General"].get("task", "instruction_tuning")
- if task == "instruction_tuning" or task == "pretraining":
+ if task in ["instruction_tuning", "pretraining"]:
return transformers.DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
)
elif task == "reasoning":
return transformers.DataCollatorForSeq2Seq(
tokenizer=tokenizer, max_length=config["Dataset"]["max_length"], return_tensors="pt"
)
elif task == "dpo":
return DPOCollator(tokenizer)
elif task == "rerank":
@@ -338,14 +358,14 @@ def load_model(config: Dict):
model_config = config["General"].get("config", {})
task = config["General"].get("task", "instruction_tuning")
ref_model = None
- if task in ["instruction_tuning", "pretraining", "dpo"]:
+ if task in ["instruction_tuning", "pretraining", "dpo", "reasoning"]:
model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, **model_config)
if task == "dpo":
ref_model = transformers.AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype=model_dtype, **model_config
)
lora_config = config["General"].get("lora_config", None)
- if lora_config and task == "instruction_tuning":
+ if lora_config and task in ["instruction_tuning"]:
peft_config = LoraConfig(**lora_config)
model = get_peft_model(model, peft_config)
elif task == "rerank":
@@ -389,6 +409,8 @@ def load_model(config: Dict):
def get_trainer(config: Dict, model, ref_model, tokenizer, tokenized_dataset, data_collator):
device = config["Training"]["device"]
task = config["General"].get("task", "instruction_tuning")
if task == "reasoning":
model.resize_token_embeddings(len(tokenizer))
if device in ["cpu", "gpu", "cuda"]:
training_args = convert_to_training_args(TrainingArguments, config)
if task == "dpo":