diff --git a/bootstrap.sh b/bootstrap.sh new file mode 100644 index 0000000000..696cc23c03 --- /dev/null +++ b/bootstrap.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -e + +echo "=== [Step 1] Updating package lists ===" +apt update + +echo "=== [Step 2] Installing basic tools ===" +apt install -y \ + vim \ + git \ + curl \ + gnupg \ + wget \ + unzip \ + htop \ + tmux \ + python3-pip \ + bash-completion + +echo "=== [Step 3] Setting up Python environment ===" +pip3 install --upgrade pip +pip3 install virtualenv ipython +pip3 install -r requirements.txt + +echo "=== [Step 4] Installing oh-my-bash ===" +if [ ! -d "$HOME/.oh-my-bash" ]; then + git clone https://github.com/ohmybash/oh-my-bash.git ~/.oh-my-bash + cp ~/.oh-my-bash/templates/bashrc.osh-template ~/.bashrc + sed -i 's/^OSH_THEME=.*/OSH_THEME="font"/' ~/.bashrc +fi + +echo "=== [Step 5] Configuring Git ===" +# Replace the values below with your identity if needed +git config --global user.name "Yexi Jiang" +git config --global user.email "2237303+yxjiang@users.noreply.github.com" +git config --global init.defaultBranch main +git config --global core.editor vim +git config --global color.ui auto + +# Soft link ssh and Git setting +# Ensure /workspace exists and is writable +if [ ! 
-d "/workspace" ]; then + mkdir -p /workspace +fi +chmod u+w /workspace + +mkdir -p ~/.ssh +cp /workspace/bootstrap/config/.ssh/id_ed25519 ~/.ssh/id_ed25519 +cp /workspace/bootstrap/config/.ssh/id_ed25519.pub ~/.ssh/id_ed25519.pub +cp /workspace/bootstrap/config/.ssh/config ~/.ssh/config +cp /workspace/bootstrap/config/.gitconfig ~/.gitconfig + +chmod 600 ~/.ssh/id_ed25519 + +# Start ssh-agent +eval "$(ssh-agent -s)" +ssh-add ~/.ssh/id_ed25519 + +echo "=== [Step 6] Install Ollama ===" +# Install Ollama +curl -fsSL https://ollama.com/install.sh | sh +ollama serve & + +echo "=== [Step 7] Ramp up workspace directory ===" +source ~/.bashrc +cat /workspace/bootstrap/.bashrc >> ~/.bashrc + +echo "=== Done ===" +echo "Run 'source ~/.bashrc' to activate oh-my-bash" +source ~/.bashrc + +# Create folders if not exist +mkdir -p /workspace/models +mkdir -p /workspace/data \ No newline at end of file diff --git a/data/dataset_info.json b/data/dataset_info.json index 1ce4639604..266e89c532 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -714,5 +714,20 @@ "prompt": "content" }, "folder": "python" + }, + "big_reasoning_traces": { + "hf_hub_url": "tech-tao/big-reasoning-traces-100k", + "formatting": "sharegpt", + "subset": "default", + "columns": { + "messages": "messages" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant", + "system_tag": "system" + } } } diff --git a/download_big_reasoning_traces.py b/download_big_reasoning_traces.py new file mode 100644 index 0000000000..4f53c82450 --- /dev/null +++ b/download_big_reasoning_traces.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +""" +Script to download big-reasoning-traces dataset, extract first N records, +and upload to Hugging Face account. 
+ +Usage: + python download_big_reasoning_traces.py --hf_username tech-tao --dataset_name big-reasoning-traces-100k --max_records 10000 --streaming +""" + +import argparse +import logging +from typing import Optional +from datasets import Dataset, load_dataset +from huggingface_hub import HfApi, login +import os + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +def download_and_process_dataset( + source_dataset: str = "allenai/big-reasoning-traces", + subset: str = "DeepSeek", + max_records: int = 100000, + use_streaming: bool = False +) -> Dataset: + """ + Download the big-reasoning-traces dataset and extract first N records. + + Args: + source_dataset: Source dataset name on Hugging Face + subset: Dataset subset to use (DeepSeek, OpenThoughts, etc.) + max_records: Maximum number of records to extract + use_streaming: Use streaming to avoid loading entire dataset in memory + + Returns: + Processed dataset with first N records + """ + logger.info(f"Downloading dataset: {source_dataset}") + logger.info(f"Using subset: {subset}") + logger.info(f"Extracting first {max_records:,} records") + logger.info(f"Streaming mode: {use_streaming}") + + try: + if use_streaming: + # Use streaming for memory-efficient processing + logger.info("Using streaming mode for memory efficiency...") + dataset_iter = load_dataset( + source_dataset, + subset, + split="train", + streaming=True + ) + + # Collect first N records + records = [] + for i, record in enumerate(dataset_iter): + if i >= max_records: + break + records.append(record) + if (i + 1) % 10000 == 0: + logger.info(f"Processed {i + 1:,} records...") + + # Convert to Dataset + dataset = Dataset.from_list(records) + logger.info(f"Collected {len(dataset):,} records via streaming") + + else: + # Load entire dataset (uses more memory) + logger.info("Loading entire dataset into memory...") + dataset = 
load_dataset(source_dataset, subset, split="train") + logger.info(f"Original dataset size: {len(dataset):,} records") + + # Extract first N records + if len(dataset) > max_records: + dataset = dataset.select(range(max_records)) + logger.info(f"Truncated to {len(dataset):,} records") + else: + logger.warning(f"Dataset has only {len(dataset):,} records, less than requested {max_records:,}") + + # Show dataset structure + logger.info(f"Dataset features: {dataset.features}") + logger.info(f"Sample record keys: {list(dataset[0].keys())}") + + return dataset + + except Exception as e: + logger.error(f"Error downloading dataset: {e}") + raise + + +def upload_to_huggingface( + dataset: Dataset, + username: str, + dataset_name: str, + private: bool = False +) -> str: + """ + Upload dataset to Hugging Face Hub. + + Args: + dataset: Dataset to upload + username: Hugging Face username + dataset_name: Name for the new dataset + private: Whether to make the dataset private + + Returns: + URL of the uploaded dataset + """ + repo_id = f"{username}/{dataset_name}" + + logger.info(f"Uploading dataset to: {repo_id}") + logger.info(f"Dataset size: {len(dataset):,} records") + logger.info(f"Private: {private}") + + try: + # Push to hub + dataset.push_to_hub( + repo_id=repo_id, + private=private, + commit_message="Initial upload: First 100k records from big-reasoning-traces" + ) + + dataset_url = f"https://huggingface.co/datasets/{repo_id}" + logger.info(f"Successfully uploaded dataset to: {dataset_url}") + + return dataset_url + + except Exception as e: + logger.error(f"Error uploading dataset: {e}") + raise + + +def main(): + """Main function to orchestrate the download and upload process.""" + parser = argparse.ArgumentParser( + description="Download big-reasoning-traces dataset and upload subset to Hugging Face" + ) + parser.add_argument( + "--hf_username", + type=str, + required=True, + help="Your Hugging Face username" + ) + parser.add_argument( + "--dataset_name", + type=str, + 
required=True, + help="Name for your new dataset" + ) + parser.add_argument( + "--max_records", + type=int, + default=100000, + help="Maximum number of records to extract (default: 100000)" + ) + parser.add_argument( + "--subset", + type=str, + default="DeepSeek", + choices=["DeepSeek", "OpenThoughts", "OpenR1-Math"], + help="Dataset subset to use (default: DeepSeek)" + ) + parser.add_argument( + "--private", + action="store_true", + help="Make the uploaded dataset private" + ) + parser.add_argument( + "--hf_token", + type=str, + help="Hugging Face token (or set HF_TOKEN environment variable)" + ) + parser.add_argument( + "--streaming", + action="store_true", + help="Use streaming mode for memory-efficient processing of large datasets" + ) + + args = parser.parse_args() + + # Login to Hugging Face + token = args.hf_token or os.getenv("HF_TOKEN") + if not token: + logger.error("Hugging Face token required. Set HF_TOKEN environment variable or use --hf_token") + return 1 + + try: + login(token=token) + logger.info("Successfully logged in to Hugging Face") + except Exception as e: + logger.error(f"Failed to login to Hugging Face: {e}") + return 1 + + try: + # Download and process dataset + dataset = download_and_process_dataset( + subset=args.subset, + max_records=args.max_records, + use_streaming=args.streaming + ) + + # Upload to Hugging Face + dataset_url = upload_to_huggingface( + dataset=dataset, + username=args.hf_username, + dataset_name=args.dataset_name, + private=args.private + ) + + logger.info("=" * 60) + logger.info("SUCCESS!") + logger.info(f"Dataset uploaded to: {dataset_url}") + logger.info(f"Records: {len(dataset):,}") + logger.info(f"Features: {list(dataset.features.keys())}") + logger.info("=" * 60) + + return 0 + + except Exception as e: + logger.error(f"Process failed: {e}") + return 1 + + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/examples/inference/qwen3.yaml b/examples/inference/qwen3.yaml new file mode 
100644 index 0000000000..c3ce4e7588 --- /dev/null +++ b/examples/inference/qwen3.yaml @@ -0,0 +1,4 @@ +model_name_or_path: Qwen/Qwen3-4B +template: qwen3 +infer_backend: huggingface # choices: [huggingface, vllm, sglang] +trust_remote_code: true diff --git a/examples/train_full/README_qwen3_4b_reasoning.md b/examples/train_full/README_qwen3_4b_reasoning.md new file mode 100644 index 0000000000..cfee07b19d --- /dev/null +++ b/examples/train_full/README_qwen3_4b_reasoning.md @@ -0,0 +1,70 @@ +# Qwen3-4B Training Configurations for Big Reasoning Traces + +This directory contains training configurations for fine-tuning Qwen3-4B on the Big Reasoning Traces dataset from AllenAI. + +## Dataset + +- **Source**: [allenai/big-reasoning-traces](https://huggingface.co/datasets/allenai/big-reasoning-traces) +- **Subset**: `deepseek_debug` +- **Format**: Question-Answer pairs for reasoning tasks +- **Dataset Config**: Added to `data/dataset_info.json` + +## Configurations + +### 1. Full Fine-tuning (`qwen3_4b_full_sft.yaml`) +- **Method**: Full parameter fine-tuning +- **Memory**: High (requires significant GPU memory) +- **Learning Rate**: 5.0e-6 (lower for full fine-tuning) +- **Batch Size**: 1 with gradient accumulation of 4 +- **Context Length**: 4096 tokens + +### 2. LoRA Fine-tuning (`qwen3_4b_lora_sft.yaml`) +- **Method**: Low-Rank Adaptation (LoRA) +- **Memory**: Moderate +- **Learning Rate**: 1.0e-4 (higher for LoRA) +- **LoRA Rank**: 16 (higher for reasoning tasks) +- **Batch Size**: 2 with gradient accumulation of 2 + +### 3. 
QLoRA Fine-tuning (`qwen3_4b_qlora_sft.yaml`) +- **Method**: Quantized LoRA (4-bit quantization) +- **Memory**: Low (most memory efficient) +- **Learning Rate**: 1.0e-4 +- **LoRA Rank**: 16 +- **Batch Size**: 2 with gradient accumulation of 4 + +## Key Features + +- **Thinking Mode**: Enabled (`enable_thinking: true`) for enhanced reasoning capabilities +- **Template**: Uses `qwen3` template optimized for Qwen3 models +- **Context Length**: Increased to 4096 tokens for longer reasoning chains +- **DeepSpeed**: Uses ZeRO-3 for distributed training + +## Usage + +### Full Fine-tuning +```bash +llamafactory-cli train examples/train_full/qwen3_4b_full_sft.yaml +``` + +### LoRA Fine-tuning +```bash +llamafactory-cli train examples/train_lora/qwen3_4b_lora_sft.yaml +``` + +### QLoRA Fine-tuning +```bash +llamafactory-cli train examples/train_qlora/qwen3_4b_qlora_sft.yaml +``` + +## Hardware Requirements + +- **Full Fine-tuning**: 4x A100 80GB or equivalent +- **LoRA**: 2x A100 40GB or equivalent +- **QLoRA**: 1x A100 24GB or equivalent + +## Notes + +- Adjust `max_samples` based on your dataset size and training time constraints +- The `enable_thinking` parameter is crucial for reasoning tasks with Qwen3 +- Consider enabling evaluation by uncommenting the eval section for monitoring training progress +- Output models will be saved in `saves/qwen3-4b/` directory structure \ No newline at end of file diff --git a/examples/train_full/qwen3_4b_full_sft.yaml b/examples/train_full/qwen3_4b_full_sft.yaml new file mode 100644 index 0000000000..8048164ac6 --- /dev/null +++ b/examples/train_full/qwen3_4b_full_sft.yaml @@ -0,0 +1,49 @@ +### model +model_name_or_path: Qwen/Qwen3-4B +trust_remote_code: true +flash_attn: auto + +### method +stage: sft +do_train: true +finetuning_type: full +# deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json] + +### dataset +dataset: big_reasoning_traces +template: qwen3 +cutoff_len: 
1024 # Increased for reasoning tasks +max_samples: 200 # Adjust based on your needs +overwrite_cache: true +preprocessing_num_workers: 2 +dataloader_num_workers: 0 # Disabled to avoid multiprocessing issues with gradient checkpointing +enable_thinking: true # Enable thinking mode for reasoning tasks +cache_dir: /workspace/datasets/ + +### output +output_dir: /workspace/models/qwen3-4b/full/sft +logging_steps: 10 +save_steps: 5000 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: wandb # choices: [none, wandb, tensorboard, swanlab, mlflow] +run_name: qwen3-4b-full-sft + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 # Increased for better gradient estimates +learning_rate: 5.0e-6 # Lower learning rate for full fine-tuning +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# eval_dataset: big_reasoning_traces
# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 917a51d199..4bdf27e6d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ transformers>=4.45.0,<=4.52.4,!=4.46.*,!=4.47.*,!=4.48.0,!=4.52.0; sys_platform != 'darwin' transformers>=4.45.0,<=4.51.3,!=4.46.*,!=4.47.*,!=4.48.0,!=4.52.0; sys_platform == 'darwin' -datasets>=2.16.0,<=3.6.0 +datasets>=2.14.0 accelerate>=1.3.0,<=1.7.0 peft>=0.14.0,<=0.15.2 trl>=0.8.6,<=0.9.6 @@ -25,3 +25,9 @@ pandas>=2.0.0 av librosa tyro<0.9.0 +deepspeed>=0.9.3 +wandb +huggingface-hub>=0.16.0 +torch>=2.4.0 +bitsandbytes +lm_eval diff --git a/src/llamafactory/model/loader.py 
b/src/llamafactory/model/loader.py index 7c0ce2f48f..4428165189 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -52,6 +52,17 @@ logger = logging.get_logger(__name__) +def _apply_qwen3_patch(): + """Apply patch for Qwen3 models in transformers 4.52.4.""" + try: + import transformers.modeling_utils + if transformers.modeling_utils.ALL_PARALLEL_STYLES is None: + transformers.modeling_utils.ALL_PARALLEL_STYLES = ['colwise', 'rowwise'] + logger.info_rank0("Applied Qwen3 compatibility patch for transformers 4.52.4") + except Exception as e: + logger.warning_rank0(f"Failed to apply Qwen3 patch: {e}") + + class TokenizerModule(TypedDict): tokenizer: "PreTrainedTokenizer" processor: Optional["ProcessorMixin"] @@ -140,6 +151,9 @@ def load_model( add_valuehead: bool = False, ) -> "PreTrainedModel": r"""Load pretrained model.""" + # Apply Qwen3 patch if needed + _apply_qwen3_patch() + init_kwargs = _get_init_kwargs(model_args) config = load_config(model_args) patch_config(config, tokenizer, model_args, init_kwargs, is_trainable) diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py index 714aca0334..001409d3a7 100644 --- a/src/llamafactory/model/model_utils/checkpointing.py +++ b/src/llamafactory/model/model_utils/checkpointing.py @@ -77,8 +77,7 @@ def backward(ctx: "torch.autograd.Function", grad_output: "torch.Tensor") -> "to def get_custom_gradient_checkpointing_func(gradient_checkpointing_func: Callable) -> Callable: r"""Only applies gradient checkpointing to trainable layers.""" - - @wraps(gradient_checkpointing_func, assigned=WRAPPER_ASSIGNMENTS + ("__self__",)) + def custom_gradient_checkpointing_func(func: Callable, *args: Union["torch.Tensor", Any], **kwargs): if isinstance(func, partial): module: torch.nn.Module = func.func.__self__ @@ -98,6 +97,10 @@ def custom_gradient_checkpointing_func(func: Callable, *args: Union["torch.Tenso else: return func(*args, 
**kwargs) + # Set the function name and module for better pickling + custom_gradient_checkpointing_func.__name__ = "custom_gradient_checkpointing_func" + custom_gradient_checkpointing_func.__module__ = "llamafactory.model.model_utils.checkpointing" + return custom_gradient_checkpointing_func