74 changes: 74 additions & 0 deletions bootstrap.sh
@@ -0,0 +1,74 @@
#!/bin/bash
set -e

echo "=== [Step 1] Updating package lists ==="
apt update

echo "=== [Step 2] Installing basic tools ==="
apt install -y \
    vim \
    git \
    curl \
    gnupg \
    wget \
    unzip \
    htop \
    tmux \
    python3-pip \
    bash-completion

echo "=== [Step 3] Setting up Python environment ==="
pip3 install --upgrade pip
pip3 install virtualenv ipython
pip3 install -r requirements.txt

echo "=== [Step 4] Installing oh-my-bash ==="
if [ ! -d "$HOME/.oh-my-bash" ]; then
git clone https://github.com/ohmybash/oh-my-bash.git ~/.oh-my-bash
cp ~/.oh-my-bash/templates/bashrc.osh-template ~/.bashrc
sed -i 's/^OSH_THEME=.*/OSH_THEME="font"/' ~/.bashrc
fi

echo "=== [Step 5] Configuring Git ==="
# Replace the values below with your identity if needed
git config --global user.name "Yexi Jiang"
git config --global user.email "2237303+yxjiang@users.noreply.github.com"
git config --global init.defaultBranch main
git config --global core.editor vim
git config --global color.ui auto

# Copy SSH and Git settings from the workspace
# Ensure /workspace exists and is writable
if [ ! -d "/workspace" ]; then
mkdir -p /workspace
fi
chmod u+w /workspace

mkdir -p ~/.ssh
cp /workspace/bootstrap/config/.ssh/id_ed25519 ~/.ssh/id_ed25519
cp /workspace/bootstrap/config/.ssh/id_ed25519.pub ~/.ssh/id_ed25519.pub
cp /workspace/bootstrap/config/.ssh/config ~/.ssh/config
cp /workspace/bootstrap/config/.gitconfig ~/.gitconfig

chmod 700 ~/.ssh
chmod 600 ~/.ssh/id_ed25519 ~/.ssh/config

# Start ssh-agent
eval "$(ssh-agent -s)"
ssh-add ~/.ssh/id_ed25519

echo "=== [Step 6] Install Ollama ==="
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
ollama serve &

echo "=== [Step 7] Ramp up workspace directory ==="
source .bashrc
cat /workspace/bootstrap/.bashrc >> ~/.bashrc

echo "=== Done ==="
echo "Run 'source ~/.bashrc' to activate oh-my-bash"
source ~/.bashrc

# Create folders if not exist
mkdir -p /workspace/models
mkdir -p /workspace/data
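Since Step 6 backgrounds `ollama serve` without waiting for it, a quick smoke test after bootstrap can confirm the server came up. A minimal sketch, assuming Ollama's default port 11434 and its `/api/version` endpoint (adjust if OLLAMA_HOST is customized); this helper is hypothetical and not part of the PR:

#!/usr/bin/env python3
# check_ollama.py -- hypothetical post-bootstrap helper, not part of this PR.
# Assumes Ollama's default port 11434 and its /api/version endpoint.
import json
import urllib.request

def ollama_is_up(base_url="http://localhost:11434"):
    """Return True if the local Ollama server answers its version endpoint."""
    try:
        with urllib.request.urlopen(f"{base_url}/api/version", timeout=5) as resp:
            return "version" in json.load(resp)
    except OSError:
        return False

if __name__ == "__main__":
    print("Ollama reachable:", ollama_is_up())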
15 changes: 15 additions & 0 deletions data/dataset_info.json
@@ -714,5 +714,20 @@
"prompt": "content"
},
"folder": "python"
},
"big_reasoning_traces": {
"hf_hub_url": "tech-tao/big-reasoning-traces-100k",
"formatting": "sharegpt",
"subset": "default",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system"
}
}
}
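The sharegpt formatting declared above tells the loader to read each record's `messages` column as a list of role-tagged turns, interpreted through the `tags` mapping. An illustrative record shape (hypothetical content, not taken from the dataset), written as a Python dict:

# Illustrative sharegpt-style record matching the column/tag mapping above.
# The actual dataset contents will differ; this only shows the expected shape.
record = {
    "messages": [
        {"role": "system", "content": "You are a careful reasoner."},
        {"role": "user", "content": "What is 17 * 24?"},
        {"role": "assistant", "content": "17 * 24 = 408."},
    ]
}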
230 changes: 230 additions & 0 deletions download_big_reasoning_traces.py
@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
Download the big-reasoning-traces dataset, extract the first N records,
and upload them to a Hugging Face account.

Usage:
    python download_big_reasoning_traces.py --hf_username tech-tao --dataset_name big-reasoning-traces-100k --max_records 10000 --streaming
"""

import argparse
import logging
import os
import sys

from datasets import Dataset, load_dataset
from huggingface_hub import login

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def download_and_process_dataset(
    source_dataset: str = "allenai/big-reasoning-traces",
    subset: str = "DeepSeek",
    max_records: int = 100000,
    use_streaming: bool = False
) -> Dataset:
    """
    Download the big-reasoning-traces dataset and extract the first N records.

    Args:
        source_dataset: Source dataset name on Hugging Face
        subset: Dataset subset to use (DeepSeek, OpenThoughts, etc.)
        max_records: Maximum number of records to extract
        use_streaming: Use streaming to avoid loading the entire dataset into memory

    Returns:
        Processed dataset with the first N records
    """
    logger.info(f"Downloading dataset: {source_dataset}")
    logger.info(f"Using subset: {subset}")
    logger.info(f"Extracting first {max_records:,} records")
    logger.info(f"Streaming mode: {use_streaming}")

    try:
        if use_streaming:
            # Stream records one at a time for memory-efficient processing
            logger.info("Using streaming mode for memory efficiency...")
            dataset_iter = load_dataset(
                source_dataset,
                subset,
                split="train",
                streaming=True
            )

            # Collect the first N records
            records = []
            for i, record in enumerate(dataset_iter):
                if i >= max_records:
                    break
                records.append(record)
                if (i + 1) % 10000 == 0:
                    logger.info(f"Processed {i + 1:,} records...")

            # Convert to Dataset
            dataset = Dataset.from_list(records)
            logger.info(f"Collected {len(dataset):,} records via streaming")

        else:
            # Load the entire dataset (uses more memory)
            logger.info("Loading entire dataset into memory...")
            dataset = load_dataset(source_dataset, subset, split="train")
            logger.info(f"Original dataset size: {len(dataset):,} records")

            # Extract the first N records
            if len(dataset) > max_records:
                dataset = dataset.select(range(max_records))
                logger.info(f"Truncated to {len(dataset):,} records")
            else:
                logger.warning(
                    f"Dataset has only {len(dataset):,} records, "
                    f"less than the requested {max_records:,}"
                )

        # Show dataset structure
        logger.info(f"Dataset features: {dataset.features}")
        logger.info(f"Sample record keys: {list(dataset[0].keys())}")

        return dataset

    except Exception as e:
        logger.error(f"Error downloading dataset: {e}")
        raise


def upload_to_huggingface(
    dataset: Dataset,
    username: str,
    dataset_name: str,
    private: bool = False
) -> str:
    """
    Upload dataset to the Hugging Face Hub.

    Args:
        dataset: Dataset to upload
        username: Hugging Face username
        dataset_name: Name for the new dataset
        private: Whether to make the dataset private

    Returns:
        URL of the uploaded dataset
    """
    repo_id = f"{username}/{dataset_name}"

    logger.info(f"Uploading dataset to: {repo_id}")
    logger.info(f"Dataset size: {len(dataset):,} records")
    logger.info(f"Private: {private}")

    try:
        # Push to hub; the commit message reflects the actual record count
        dataset.push_to_hub(
            repo_id=repo_id,
            private=private,
            commit_message=f"Initial upload: first {len(dataset):,} records from big-reasoning-traces"
        )

        dataset_url = f"https://huggingface.co/datasets/{repo_id}"
        logger.info(f"Successfully uploaded dataset to: {dataset_url}")

        return dataset_url

    except Exception as e:
        logger.error(f"Error uploading dataset: {e}")
        raise


def main():
    """Main function to orchestrate the download and upload process."""
    parser = argparse.ArgumentParser(
        description="Download big-reasoning-traces dataset and upload a subset to Hugging Face"
    )
    parser.add_argument(
        "--hf_username",
        type=str,
        required=True,
        help="Your Hugging Face username"
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        required=True,
        help="Name for your new dataset"
    )
    parser.add_argument(
        "--max_records",
        type=int,
        default=100000,
        help="Maximum number of records to extract (default: 100000)"
    )
    parser.add_argument(
        "--subset",
        type=str,
        default="DeepSeek",
        choices=["DeepSeek", "OpenThoughts", "OpenR1-Math"],
        help="Dataset subset to use (default: DeepSeek)"
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Make the uploaded dataset private"
    )
    parser.add_argument(
        "--hf_token",
        type=str,
        help="Hugging Face token (or set the HF_TOKEN environment variable)"
    )
    parser.add_argument(
        "--streaming",
        action="store_true",
        help="Use streaming mode for memory-efficient processing of large datasets"
    )

    args = parser.parse_args()

    # Login to Hugging Face
    token = args.hf_token or os.getenv("HF_TOKEN")
    if not token:
        logger.error("Hugging Face token required. Set the HF_TOKEN environment variable or use --hf_token")
        return 1

    try:
        login(token=token)
        logger.info("Successfully logged in to Hugging Face")
    except Exception as e:
        logger.error(f"Failed to log in to Hugging Face: {e}")
        return 1

    try:
        # Download and process the dataset
        dataset = download_and_process_dataset(
            subset=args.subset,
            max_records=args.max_records,
            use_streaming=args.streaming
        )

        # Upload to Hugging Face
        dataset_url = upload_to_huggingface(
            dataset=dataset,
            username=args.hf_username,
            dataset_name=args.dataset_name,
            private=args.private
        )

        logger.info("=" * 60)
        logger.info("SUCCESS!")
        logger.info(f"Dataset uploaded to: {dataset_url}")
        logger.info(f"Records: {len(dataset):,}")
        logger.info(f"Features: {list(dataset.features.keys())}")
        logger.info("=" * 60)

        return 0

    except Exception as e:
        logger.error(f"Process failed: {e}")
        return 1


if __name__ == "__main__":
    sys.exit(main())
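After the script reports success, the uploaded copy can be spot-checked by streaming a few records back from the Hub. A minimal sketch, reusing the repo id from the usage example above (substitute your own):

# Spot-check the uploaded dataset (a sketch; substitute your own repo id).
from datasets import load_dataset

ds = load_dataset("tech-tao/big-reasoning-traces-100k", split="train", streaming=True)
for i, row in zip(range(3), ds):
    print(i, list(row.keys()))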
4 changes: 4 additions & 0 deletions examples/inference/qwen3.yaml
@@ -0,0 +1,4 @@
model_name_or_path: Qwen/Qwen3-4B
template: qwen3
infer_backend: huggingface # choices: [huggingface, vllm, sglang]
trust_remote_code: true
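With `infer_backend: huggingface`, this config (in LLaMA-Factory's inference-example format) serves the model through transformers. A rough standalone equivalent of what it loads, as a sketch rather than the framework's actual code path; assumes transformers and accelerate are installed, and the `messages` content is illustrative:

# Rough transformers-only equivalent of the config above (a sketch; the
# framework's real code path also applies the qwen3 chat template defaults,
# sampling settings, etc.).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen3-4B"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, device_map="auto"
)

messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))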