3 changes: 3 additions & 0 deletions project/paperbench/data/papers/README.md
Git LFS file not shown
70 changes: 70 additions & 0 deletions project/paperbench/data/papers/dataset_card.yaml
@@ -0,0 +1,70 @@
language:
- en

license: mit

size_categories:
- n<1K

task_categories:
- other

pretty_name: PaperBench

tags:
- ai-research
- code-generation
- agent-evaluation
- machine-learning
- research-replication
- benchmark
- ai-agents
- code-completion

annotations_creators:
- expert-generated

language_creators:
- found

source_datasets:
- original

multilinguality:
- monolingual

task_ids: []

paperswithcode_id: paperbench

configs:
- config_name: default
  data_files:
  - split: train
    path: data/train-*

dataset_info:
  features:
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: blacklisted_sites
    list: string
  - name: rubric_requirements
    dtype: string
  - name: rubric_total_nodes
    dtype: int64
  - name: rubric_leaf_nodes
    dtype: int64
  - name: rubric_code_development
    dtype: int64
  - name: rubric_code_execution
    dtype: int64
  - name: rubric_result_analysis
    dtype: int64
  - name: reference_files
    list: string
  - name: reference_file_urls
    list: string
  - name: reference_file_hf_uris
    list: string
  splits:
  - name: train
    num_examples: 20
  download_size: 0
  dataset_size: 0
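
For orientation, a minimal sketch of how the default config declared above is consumed once the dataset is on the Hub; the repository id "<namespace>/paperbench" is a placeholder for illustration, not something defined in this change:

from datasets import load_dataset

# Loads the manifest parquet matched by configs -> data_files (data/train-*).
ds = load_dataset("<namespace>/paperbench", split="train")
print(ds[0]["id"], ds[0]["reference_file_hf_uris"][:2])
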
253 changes: 253 additions & 0 deletions project/paperbench/paperbench/scripts/run_upload_huggingface.py
@@ -0,0 +1,253 @@
from __future__ import annotations

import argparse
import io
import json
import shutil
import sys
import tempfile
from pathlib import Path

import structlog.stdlib
import yaml
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, create_repo, login, upload_file

PROJECT_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_PAPERS_DIR = PROJECT_ROOT / "data" / "papers"
DEFAULT_CARD_PATH = DEFAULT_PAPERS_DIR / "README.md"
DEFAULT_YAML_CARD_PATH = DEFAULT_PAPERS_DIR / "dataset_card.yaml"

logger = structlog.stdlib.get_logger(component=__name__)


def count_rubric_tasks(rubric: dict) -> int:
    """Count all nodes in the rubric tree, including the root."""
    count = 1
    for subtask in rubric.get("sub_tasks", []):
        count += count_rubric_tasks(subtask)
    return count


def count_rubric_leaf_nodes(rubric: dict) -> int:
    """Count the number of leaf nodes (nodes with no subtasks) in the rubric."""
    if not rubric.get("sub_tasks", []):
        return 1

    count = 0
    for subtask in rubric["sub_tasks"]:
        count += count_rubric_leaf_nodes(subtask)
    return count


def count_rubric_by_category(rubric: dict) -> dict[str, int]:
    """Count leaf nodes in the rubric by task category."""
    counts = {"Code Development": 0, "Code Execution": 0, "Result Analysis": 0}
    if not rubric.get("sub_tasks", []) and rubric.get("task_category"):
        category = rubric["task_category"]
        if category in counts:
            counts[category] += 1

    for subtask in rubric.get("sub_tasks", []):
        subcounts = count_rubric_by_category(subtask)
        for category, count in subcounts.items():
            counts[category] += count

    return counts
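

# A hypothetical illustration (an assumption inferred from how the helpers above
# read the dict, not a file from the repo) of the rubric.json shape they expect:
# nested nodes with a "sub_tasks" list and a "task_category" on leaf nodes, e.g.
#     {"requirements": "...", "sub_tasks": [
#         {"task_category": "Code Development", "sub_tasks": []},
#         {"task_category": "Result Analysis", "sub_tasks": []}]}
# For that example, count_rubric_tasks returns 3, count_rubric_leaf_nodes returns 2,
# and count_rubric_by_category returns {"Code Development": 1, "Code Execution": 0,
# "Result Analysis": 1}.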


def extract_paper_metadata(paper_dir: Path, repo_id: str) -> dict:
    paper_id = paper_dir.name
    row = {}

    with open(paper_dir / "config.yaml") as f:
        config = yaml.safe_load(f)
    row["id"] = config.get("id", paper_id)
    row["title"] = config.get("title", "")

    with open(paper_dir / "blacklist.txt") as f:
        row["blacklisted_sites"] = [
            line.strip() for line in f if line.strip() and line.strip() != "none"
        ]

    with open(paper_dir / "rubric.json") as f:
        rubric = json.load(f)
    row["rubric_requirements"] = rubric.get("requirements")
    row["rubric_total_nodes"] = count_rubric_tasks(rubric)
    row["rubric_leaf_nodes"] = count_rubric_leaf_nodes(rubric)

    category_counts = count_rubric_by_category(rubric)
    row["rubric_code_development"] = category_counts["Code Development"]
    row["rubric_code_execution"] = category_counts["Code Execution"]
    row["rubric_result_analysis"] = category_counts["Result Analysis"]

    reference_files = []
    reference_file_urls = []
    reference_file_hf_uris = []

    for file_path in sorted(paper_dir.rglob("*")):
        if file_path.is_file():
            relative_path = file_path.relative_to(paper_dir.parent)
            relative_path_str = str(relative_path)

            reference_files.append(relative_path_str)
            reference_file_urls.append(
                f"https://huggingface.co/datasets/{repo_id}/resolve/main/{relative_path_str}"
            )
            reference_file_hf_uris.append(f"hf://datasets/{repo_id}@main/{relative_path_str}")

    row["reference_files"] = reference_files
    row["reference_file_urls"] = reference_file_urls
    row["reference_file_hf_uris"] = reference_file_hf_uris

    return row


def build_manifest(papers_dir: Path, repo_id: str) -> list[dict]:
    rows = []
    for paper_dir in sorted(papers_dir.iterdir()):
        if not paper_dir.is_dir() or paper_dir.name.startswith("."):
            continue
        rows.append(extract_paper_metadata(paper_dir, repo_id))
    return rows


def build_and_upload_data_parquet(
    api: HfApi,
    repo_id: str,
    branch: str,
    commit_message: str,
) -> None:
    # Build manifest for dataset viewer
    logger.info("Building dataset manifest...")
    manifest_rows = build_manifest(DEFAULT_PAPERS_DIR, repo_id)
    logger.info(f"Found {len(manifest_rows)} papers")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Copy all paper directories
        logger.info("Copying paper directories...")
        for paper_dir in DEFAULT_PAPERS_DIR.iterdir():
            if paper_dir.is_dir() and not paper_dir.name.startswith("."):
                shutil.copytree(paper_dir, temp_path / paper_dir.name)

        data_dir = temp_path / "data"
        data_dir.mkdir(exist_ok=True)

        # Create and save dataset as parquet
        logger.info("Creating dataset parquet file...")
        dataset = Dataset.from_list(manifest_rows)
        parquet_path = data_dir / "train-00000-of-00001.parquet"
        dataset.to_parquet(str(parquet_path))
        logger.info(f"Dataset preview schema: {dataset.column_names}")

        # Upload the prepared directory
        logger.info(f"Uploading to {repo_id}...")
        try:
            api.upload_folder(
                folder_path=str(temp_path),
                repo_id=repo_id,
                repo_type="dataset",
                revision=branch,
                commit_message=commit_message,
            )
        except Exception as exc:  # noqa: BLE001
            logger.error(f"Failed to upload dataset folder: {exc}")
            sys.exit(1)


def upload_dataset_card(repo_id: str, branch: str, commit_message: str) -> None:
    # Upload README card
    logger.info("Uploading README.md...")
    readme_content = DEFAULT_CARD_PATH.read_text(encoding="utf-8")

    upload_file(
        path_or_fileobj=io.BytesIO(readme_content.encode("utf-8")),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        revision=branch,
        commit_message=commit_message,
    )
    logger.info("README.md uploaded successfully")

    yaml_content = DEFAULT_YAML_CARD_PATH.read_text(encoding="utf-8")
    upload_file(
        path_or_fileobj=io.BytesIO(yaml_content.encode("utf-8")),
        path_in_repo="dataset_card.yaml",
        repo_id=repo_id,
        repo_type="dataset",
        revision=branch,
        commit_message=commit_message,
    )
    logger.info("dataset_card.yaml uploaded successfully")


def ensure_upload(repo_id: str, paper_idx: int) -> None:
    """Sanity-check the upload by re-downloading one paper's reference files."""
    logger.info("Ensuring upload...")
    dataset = load_dataset(repo_id)
    api = HfApi()
    paper_id = dataset["train"][paper_idx]["id"]
    downloaded_paths = []
    for file in dataset["train"][paper_idx]["reference_files"]:
        local_path = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset")
        downloaded_paths.append(local_path)

    paper_path = Path(downloaded_paths[0]).parent
    logger.info(f"Downloaded paper and rubric for {paper_id} successfully to: {paper_path}")


def main() -> None:
    args = parse_args()
    api = HfApi()
    try:
        user_info = api.whoami()
    except Exception:
        # No cached token: prompt for an interactive login, then retry.
        login()
        user_info = api.whoami()
    username = user_info["name"]
    # Accept either '<namespace>/<name>' or a bare name under the user's namespace.
    repo_id = args.repo if "/" in args.repo else f"{username}/{args.repo}"

    try:
        api.repo_info(repo_id, repo_type="dataset")
        logger.info(f"Repository {repo_id} already exists")
    except Exception:
        create_repo(repo_id=repo_id, repo_type="dataset", private=args.private, exist_ok=True)
        logger.info(f"Created repository: {repo_id}")

    build_and_upload_data_parquet(api, repo_id, args.branch, args.commit_message)
    upload_dataset_card(repo_id, args.branch, args.commit_message)

    logger.info("Dataset upload complete.")
    logger.info(f"View at: https://huggingface.co/datasets/{repo_id}")
    ensure_upload(repo_id, paper_idx=0)


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Upload the PaperBench papers dataset to the Hugging Face Hub.",
    )
    parser.add_argument(
        "repo",
        help=(
            "Target dataset repository. Accepts either a full '<namespace>/<name>' "
            "or just the repository name (defaults to your user namespace)."
        ),
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create the Hugging Face dataset repository as private.",
    )
    parser.add_argument(
        "--branch",
        default="main",
        help="Branch or revision to push to (default: main).",
    )
    parser.add_argument(
        "--commit-message",
        default="Upload PaperBench papers dataset",
        help="Custom commit message for the dataset upload.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    main()
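
For reference, a typical invocation might look like the following, run from the project/paperbench directory; the repository name "paperbench-papers" is illustrative rather than prescribed by this change:

python paperbench/scripts/run_upload_huggingface.py paperbench-papers --private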
2 changes: 2 additions & 0 deletions project/paperbench/pyproject.toml
@@ -47,6 +47,8 @@ typeCheckingMode = "off" # we use mypy

[dependency-groups]
dev = [
"datasets>=4.1.1",
"huggingface-hub>=0.35.3",
"mypy>=1.15.0",
"poethepoet>=0.34.0",
"pre-commit>=4.2.0",