diff --git a/bootstrap.sh b/bootstrap.sh new file mode 100644 index 0000000000..696cc23c03 --- /dev/null +++ b/bootstrap.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -e + +echo "=== [Step 1] Updating package lists ===" +apt update + +echo "=== [Step 2] Installing basic tools ===" +apt install -y \ + vim \ + git \ + curl \ + gnupg \ + wget \ + unzip \ + htop \ + tmux \ + python3-pip \ + bash-completion + +echo "=== [Step 3] Setting up Python environment ===" +pip3 install --upgrade pip +pip3 install virtualenv ipython +pip3 install -r requirements.txt + +echo "=== [Step 4] Installing oh-my-bash ===" +if [ ! -d "$HOME/.oh-my-bash" ]; then + git clone https://github.com/ohmybash/oh-my-bash.git ~/.oh-my-bash + cp ~/.oh-my-bash/templates/bashrc.osh-template ~/.bashrc + sed -i 's/^OSH_THEME=.*/OSH_THEME="font"/' ~/.bashrc +fi + +echo "=== [Step 5] Configuring Git ===" +# Replace the values below with your identity if needed +git config --global user.name "Yexi Jiang" +git config --global user.email "2237303+yxjiang@users.noreply.github.com" +git config --global init.defaultBranch main +git config --global core.editor vim +git config --global color.ui auto + +# Soft link ssh and Git setting +# Ensure /workspace exists and is writable +if [ ! 
-d "/workspace" ]; then + mkdir -p /workspace +fi +chmod u+w /workspace + +mkdir -p ~/.ssh +cp /workspace/bootstrap/config/.ssh/id_ed25519 ~/.ssh/id_ed25519 +cp /workspace/bootstrap/config/.ssh/id_ed25519.pub ~/.ssh/id_ed25519.pub +cp /workspace/bootstrap/config/.ssh/config ~/.ssh/config +cp /workspace/bootstrap/config/.gitconfig ~/.gitconfig + +chmod 600 ~/.ssh/id_ed25519 + +# Start ssh-agent +eval "$(ssh-agent -s)" +ssh-add ~/.ssh/id_ed25519 + +echo "=== [Step 6] Install Ollama ===" +# Install Ollama +curl -fsSL https://ollama.com/install.sh | sh +ollama serve & + +echo "=== [Step 7] Ramp up workspace directory ===" +source ~/.bashrc +cat /workspace/bootstrap/.bashrc >> ~/.bashrc + +echo "=== Done ===" +echo "Run 'source ~/.bashrc' to activate oh-my-bash" +source ~/.bashrc + +# Create folders if not exist +mkdir -p /workspace/models +mkdir -p /workspace/data \ No newline at end of file diff --git a/data/dataset_info.json b/data/dataset_info.json index 1ce4639604..266e89c532 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -714,5 +714,20 @@ "prompt": "content" }, "folder": "python" + }, + "big_reasoning_traces": { + "hf_hub_url": "tech-tao/big-reasoning-traces-100k", + "formatting": "sharegpt", + "subset": "default", + "columns": { + "messages": "messages" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant", + "system_tag": "system" + } } } diff --git a/download_big_reasoning_traces.py b/download_big_reasoning_traces.py new file mode 100644 index 0000000000..4f53c82450 --- /dev/null +++ b/download_big_reasoning_traces.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +""" +Script to download big-reasoning-traces dataset, extract first N records, +and upload to Hugging Face account. 
+ +Usage: + python download_big_reasoning_traces.py --hf_username tech-tao --dataset_name big-reasoning-traces-100k --max_records 10000 --streaming +""" + +import argparse +import logging +from typing import Optional +from datasets import Dataset, load_dataset +from huggingface_hub import HfApi, login +import os + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +def download_and_process_dataset( + source_dataset: str = "allenai/big-reasoning-traces", + subset: str = "DeepSeek", + max_records: int = 100000, + use_streaming: bool = False +) -> Dataset: + """ + Download the big-reasoning-traces dataset and extract first N records. + + Args: + source_dataset: Source dataset name on Hugging Face + subset: Dataset subset to use (DeepSeek, OpenThoughts, etc.) + max_records: Maximum number of records to extract + use_streaming: Use streaming to avoid loading entire dataset in memory + + Returns: + Processed dataset with first N records + """ + logger.info(f"Downloading dataset: {source_dataset}") + logger.info(f"Using subset: {subset}") + logger.info(f"Extracting first {max_records:,} records") + logger.info(f"Streaming mode: {use_streaming}") + + try: + if use_streaming: + # Use streaming for memory-efficient processing + logger.info("Using streaming mode for memory efficiency...") + dataset_iter = load_dataset( + source_dataset, + subset, + split="train", + streaming=True + ) + + # Collect first N records + records = [] + for i, record in enumerate(dataset_iter): + if i >= max_records: + break + records.append(record) + if (i + 1) % 10000 == 0: + logger.info(f"Processed {i + 1:,} records...") + + # Convert to Dataset + dataset = Dataset.from_list(records) + logger.info(f"Collected {len(dataset):,} records via streaming") + + else: + # Load entire dataset (uses more memory) + logger.info("Loading entire dataset into memory...") + dataset = 
load_dataset(source_dataset, subset, split="train") + logger.info(f"Original dataset size: {len(dataset):,} records") + + # Extract first N records + if len(dataset) > max_records: + dataset = dataset.select(range(max_records)) + logger.info(f"Truncated to {len(dataset):,} records") + else: + logger.warning(f"Dataset has only {len(dataset):,} records, less than requested {max_records:,}") + + # Show dataset structure + logger.info(f"Dataset features: {dataset.features}") + logger.info(f"Sample record keys: {list(dataset[0].keys())}") + + return dataset + + except Exception as e: + logger.error(f"Error downloading dataset: {e}") + raise + + +def upload_to_huggingface( + dataset: Dataset, + username: str, + dataset_name: str, + private: bool = False +) -> str: + """ + Upload dataset to Hugging Face Hub. + + Args: + dataset: Dataset to upload + username: Hugging Face username + dataset_name: Name for the new dataset + private: Whether to make the dataset private + + Returns: + URL of the uploaded dataset + """ + repo_id = f"{username}/{dataset_name}" + + logger.info(f"Uploading dataset to: {repo_id}") + logger.info(f"Dataset size: {len(dataset):,} records") + logger.info(f"Private: {private}") + + try: + # Push to hub + dataset.push_to_hub( + repo_id=repo_id, + private=private, + commit_message="Initial upload: First 100k records from big-reasoning-traces" + ) + + dataset_url = f"https://huggingface.co/datasets/{repo_id}" + logger.info(f"Successfully uploaded dataset to: {dataset_url}") + + return dataset_url + + except Exception as e: + logger.error(f"Error uploading dataset: {e}") + raise + + +def main(): + """Main function to orchestrate the download and upload process.""" + parser = argparse.ArgumentParser( + description="Download big-reasoning-traces dataset and upload subset to Hugging Face" + ) + parser.add_argument( + "--hf_username", + type=str, + required=True, + help="Your Hugging Face username" + ) + parser.add_argument( + "--dataset_name", + type=str, + 
required=True, + help="Name for your new dataset" + ) + parser.add_argument( + "--max_records", + type=int, + default=100000, + help="Maximum number of records to extract (default: 100000)" + ) + parser.add_argument( + "--subset", + type=str, + default="DeepSeek", + choices=["DeepSeek", "OpenThoughts", "OpenR1-Math"], + help="Dataset subset to use (default: DeepSeek)" + ) + parser.add_argument( + "--private", + action="store_true", + help="Make the uploaded dataset private" + ) + parser.add_argument( + "--hf_token", + type=str, + help="Hugging Face token (or set HF_TOKEN environment variable)" + ) + parser.add_argument( + "--streaming", + action="store_true", + help="Use streaming mode for memory-efficient processing of large datasets" + ) + + args = parser.parse_args() + + # Login to Hugging Face + token = args.hf_token or os.getenv("HF_TOKEN") + if not token: + logger.error("Hugging Face token required. Set HF_TOKEN environment variable or use --hf_token") + return 1 + + try: + login(token=token) + logger.info("Successfully logged in to Hugging Face") + except Exception as e: + logger.error(f"Failed to login to Hugging Face: {e}") + return 1 + + try: + # Download and process dataset + dataset = download_and_process_dataset( + subset=args.subset, + max_records=args.max_records, + use_streaming=args.streaming + ) + + # Upload to Hugging Face + dataset_url = upload_to_huggingface( + dataset=dataset, + username=args.hf_username, + dataset_name=args.dataset_name, + private=args.private + ) + + logger.info("=" * 60) + logger.info("SUCCESS!") + logger.info(f"Dataset uploaded to: {dataset_url}") + logger.info(f"Records: {len(dataset):,}") + logger.info(f"Features: {list(dataset.features.keys())}") + logger.info("=" * 60) + + return 0 + + except Exception as e: + logger.error(f"Process failed: {e}") + return 1 + + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/examples/inference/qwen3.yaml b/examples/inference/qwen3.yaml new file mode 
100644 index 0000000000..c3ce4e7588 --- /dev/null +++ b/examples/inference/qwen3.yaml @@ -0,0 +1,4 @@ +model_name_or_path: Qwen/Qwen3-4B +template: qwen3 +infer_backend: huggingface # choices: [huggingface, vllm, sglang] +trust_remote_code: true diff --git a/examples/train_full/README_qwen3_4b_reasoning.md b/examples/train_full/README_qwen3_4b_reasoning.md new file mode 100644 index 0000000000..cfee07b19d --- /dev/null +++ b/examples/train_full/README_qwen3_4b_reasoning.md @@ -0,0 +1,70 @@ +# Qwen3-4B Training Configurations for Big Reasoning Traces + +This directory contains training configurations for fine-tuning Qwen3-4B on the Big Reasoning Traces dataset from AllenAI. + +## Dataset + +- **Source**: [allenai/big-reasoning-traces](https://huggingface.co/datasets/allenai/big-reasoning-traces) +- **Subset**: `deepseek_debug` +- **Format**: Question-Answer pairs for reasoning tasks +- **Dataset Config**: Added to `data/dataset_info.json` + +## Configurations + +### 1. Full Fine-tuning (`qwen3_4b_full_sft.yaml`) +- **Method**: Full parameter fine-tuning +- **Memory**: High (requires significant GPU memory) +- **Learning Rate**: 5.0e-6 (lower for full fine-tuning) +- **Batch Size**: 1 with gradient accumulation of 4 +- **Context Length**: 4096 tokens + +### 2. LoRA Fine-tuning (`qwen3_4b_lora_sft.yaml`) +- **Method**: Low-Rank Adaptation (LoRA) +- **Memory**: Moderate +- **Learning Rate**: 1.0e-4 (higher for LoRA) +- **LoRA Rank**: 16 (higher for reasoning tasks) +- **Batch Size**: 2 with gradient accumulation of 2 + +### 3. 
QLoRA Fine-tuning (`qwen3_4b_qlora_sft.yaml`) +- **Method**: Quantized LoRA (4-bit quantization) +- **Memory**: Low (most memory efficient) +- **Learning Rate**: 1.0e-4 +- **LoRA Rank**: 16 +- **Batch Size**: 2 with gradient accumulation of 4 + +## Key Features + +- **Thinking Mode**: Enabled (`enable_thinking: true`) for enhanced reasoning capabilities +- **Template**: Uses `qwen3` template optimized for Qwen3 models +- **Context Length**: Increased to 4096 tokens for longer reasoning chains +- **DeepSpeed**: Uses ZeRO-3 for distributed training + +## Usage + +### Full Fine-tuning +```bash +llamafactory-cli train examples/train_full/qwen3_4b_full_sft.yaml +``` + +### LoRA Fine-tuning +```bash +llamafactory-cli train examples/train_lora/qwen3_4b_lora_sft.yaml +``` + +### QLoRA Fine-tuning +```bash +llamafactory-cli train examples/train_qlora/qwen3_4b_qlora_sft.yaml +``` + +## Hardware Requirements + +- **Full Fine-tuning**: 4x A100 80GB or equivalent +- **LoRA**: 2x A100 40GB or equivalent +- **QLoRA**: 1x A100 24GB or equivalent + +## Notes + +- Adjust `max_samples` based on your dataset size and training time constraints +- The `enable_thinking` parameter is crucial for reasoning tasks with Qwen3 +- Consider enabling evaluation by uncommenting the eval section for monitoring training progress +- Output models will be saved in `saves/qwen3-4b/` directory structure \ No newline at end of file diff --git a/examples/train_full/qwen3_4b_full_sft.yaml b/examples/train_full/qwen3_4b_full_sft.yaml new file mode 100644 index 0000000000..8048164ac6 --- /dev/null +++ b/examples/train_full/qwen3_4b_full_sft.yaml @@ -0,0 +1,49 @@ +### model +model_name_or_path: Qwen/Qwen3-4B +trust_remote_code: true +flash_attn: auto + +### method +stage: sft +do_train: true +finetuning_type: full +# deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json] + +### dataset +dataset: big_reasoning_traces +template: qwen3 +cutoff_len: 
1024 # Increased for reasoning tasks +max_samples: 200 # Adjust based on your needs +overwrite_cache: true +preprocessing_num_workers: 2 +dataloader_num_workers: 0 # Disabled to avoid multiprocessing issues with gradient checkpointing +enable_thinking: true # Enable thinking mode for reasoning tasks +cache_dir: /workspace/datasets/ + +### output +output_dir: /workspace/models/qwen3-4b/full/sft +logging_steps: 10 +save_steps: 5000 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: wandb # choices: [none, wandb, tensorboard, swanlab, mlflow] +run_name: qwen3-4b-full-sft + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 # Increased for better gradient estimates +learning_rate: 5.0e-6 # Lower learning rate for full fine-tuning +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# eval_dataset: big_reasoning_traces
# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 917a51d199..4bdf27e6d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ transformers>=4.45.0,<=4.52.4,!=4.46.*,!=4.47.*,!=4.48.0,!=4.52.0; sys_platform != 'darwin' transformers>=4.45.0,<=4.51.3,!=4.46.*,!=4.47.*,!=4.48.0,!=4.52.0; sys_platform == 'darwin' -datasets>=2.16.0,<=3.6.0 +datasets>=2.14.0 accelerate>=1.3.0,<=1.7.0 peft>=0.14.0,<=0.15.2 trl>=0.8.6,<=0.9.6 @@ -25,3 +25,9 @@ pandas>=2.0.0 av librosa tyro<0.9.0 +deepspeed>=0.9.3 +wandb +huggingface-hub>=0.16.0 +torch>=2.4.0 +bitsandbytes +lm_eval diff --git a/src/llamafactory/model/loader.py 
b/src/llamafactory/model/loader.py index 7c0ce2f48f..4428165189 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -52,6 +52,17 @@ logger = logging.get_logger(__name__) +def _apply_qwen3_patch(): + """Apply patch for Qwen3 models in transformers 4.52.4.""" + try: + import transformers.modeling_utils + if transformers.modeling_utils.ALL_PARALLEL_STYLES is None: + transformers.modeling_utils.ALL_PARALLEL_STYLES = ['colwise', 'rowwise'] + logger.info_rank0("Applied Qwen3 compatibility patch for transformers 4.52.4") + except Exception as e: + logger.warning_rank0(f"Failed to apply Qwen3 patch: {e}") + + class TokenizerModule(TypedDict): tokenizer: "PreTrainedTokenizer" processor: Optional["ProcessorMixin"] @@ -140,6 +151,9 @@ def load_model( add_valuehead: bool = False, ) -> "PreTrainedModel": r"""Load pretrained model.""" + # Apply Qwen3 patch if needed + _apply_qwen3_patch() + init_kwargs = _get_init_kwargs(model_args) config = load_config(model_args) patch_config(config, tokenizer, model_args, init_kwargs, is_trainable) diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py index 714aca0334..001409d3a7 100644 --- a/src/llamafactory/model/model_utils/checkpointing.py +++ b/src/llamafactory/model/model_utils/checkpointing.py @@ -77,8 +77,7 @@ def backward(ctx: "torch.autograd.Function", grad_output: "torch.Tensor") -> "to def get_custom_gradient_checkpointing_func(gradient_checkpointing_func: Callable) -> Callable: r"""Only applies gradient checkpointing to trainable layers.""" - - @wraps(gradient_checkpointing_func, assigned=WRAPPER_ASSIGNMENTS + ("__self__",)) + def custom_gradient_checkpointing_func(func: Callable, *args: Union["torch.Tensor", Any], **kwargs): if isinstance(func, partial): module: torch.nn.Module = func.func.__self__ @@ -98,6 +97,10 @@ def custom_gradient_checkpointing_func(func: Callable, *args: Union["torch.Tenso else: return func(*args, 
**kwargs) + # Set the function name and module for better pickling + custom_gradient_checkpointing_func.__name__ = "custom_gradient_checkpointing_func" + custom_gradient_checkpointing_func.__module__ = "llamafactory.model.model_utils.checkpointing" + return custom_gradient_checkpointing_func