3 changes: 3 additions & 0 deletions project/paperbench/data/papers/README.md
Git LFS file not shown
70 changes: 70 additions & 0 deletions project/paperbench/data/papers/dataset_card.yaml
@@ -0,0 +1,70 @@
language:
- en

license: mit

size_categories:
- n<1K

task_categories:
- other

pretty_name: PaperBench

tags:
- ai-research
- code-generation
- agent-evaluation
- machine-learning
- research-replication
- benchmark
- ai-agents
- code-completion

annotations_creators:
- expert-generated

language_creators:
- found

source_datasets:
- original

multilinguality:
- monolingual

task_ids: []

paperswithcode_id: paperbench

configs:
- config_name: default
  data_files:
  - split: train
    path: data/train-*

dataset_info:
  features:
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: blacklisted_sites
    list: string
  - name: rubric_requirements
    dtype: string
  - name: rubric_total_nodes
    dtype: int64
  - name: rubric_leaf_nodes
    dtype: int64
  - name: rubric_code_development
    dtype: int64
  - name: rubric_code_execution
    dtype: int64
  - name: rubric_result_analysis
    dtype: int64
  - name: reference_files
    list: string
  - name: reference_file_urls
    list: string
  - name: reference_file_hf_uris
    list: string
  splits:
  - name: train
    num_examples: 20
  download_size: 0
  dataset_size: 0
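
For orientation, a minimal sketch of how the default config declared above is consumed once the dataset is on the Hub; the repository id "<namespace>/paperbench" is a placeholder for illustration, not something defined in this change:

from datasets import load_dataset

# Loads the manifest parquet matched by configs -> data_files (data/train-*).
ds = load_dataset("<namespace>/paperbench", split="train")
print(ds[0]["id"], ds[0]["reference_file_hf_uris"][:2])
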
253 changes: 253 additions & 0 deletions project/paperbench/paperbench/scripts/run_upload_huggingface.py
@@ -0,0 +1,253 @@
from __future__ import annotations

import argparse
import io
import json
import shutil
import sys
import tempfile
from pathlib import Path

import structlog.stdlib
import yaml
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, create_repo, login, upload_file

PROJECT_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_PAPERS_DIR = PROJECT_ROOT / "data" / "papers"
DEFAULT_CARD_PATH = DEFAULT_PAPERS_DIR / "README.md"
DEFAULT_YAML_CARD_PATH = DEFAULT_PAPERS_DIR / "dataset_card.yaml"

logger = structlog.stdlib.get_logger(component=__name__)


def count_rubric_tasks(rubric: dict) -> int:
    """Count all nodes in the rubric tree, including the root."""
    count = 1
    for subtask in rubric.get("sub_tasks", []):
        count += count_rubric_tasks(subtask)
    return count


def count_rubric_leaf_nodes(rubric: dict) -> int:
    """Count the number of leaf nodes (nodes with no subtasks) in the rubric."""
    if not rubric.get("sub_tasks", []):
        return 1

    count = 0
    for subtask in rubric["sub_tasks"]:
        count += count_rubric_leaf_nodes(subtask)
    return count


def count_rubric_by_category(rubric: dict) -> dict[str, int]:
    """Count leaf nodes in the rubric by task category."""
    counts = {"Code Development": 0, "Code Execution": 0, "Result Analysis": 0}
    if not rubric.get("sub_tasks", []) and rubric.get("task_category"):
        category = rubric["task_category"]
        if category in counts:
            counts[category] += 1

    for subtask in rubric.get("sub_tasks", []):
        subcounts = count_rubric_by_category(subtask)
        for category, count in subcounts.items():
            counts[category] += count

    return counts
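

# A hypothetical illustration (an assumption inferred from how the helpers above
# read the dict, not a file from the repo) of the rubric.json shape they expect:
# nested nodes with a "sub_tasks" list and a "task_category" on leaf nodes, e.g.
#     {"requirements": "...", "sub_tasks": [
#         {"task_category": "Code Development", "sub_tasks": []},
#         {"task_category": "Result Analysis", "sub_tasks": []}]}
# For that example, count_rubric_tasks returns 3, count_rubric_leaf_nodes returns 2,
# and count_rubric_by_category returns {"Code Development": 1, "Code Execution": 0,
# "Result Analysis": 1}.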


def extract_paper_metadata(paper_dir: Path, repo_id: str) -> dict:
    paper_id = paper_dir.name
    row = {}

    with open(paper_dir / "config.yaml") as f:
        config = yaml.safe_load(f)
    row["id"] = config.get("id", paper_id)
    row["title"] = config.get("title", "")

    with open(paper_dir / "blacklist.txt") as f:
        row["blacklisted_sites"] = [
            line.strip() for line in f if line.strip() and line.strip() != "none"
        ]

    with open(paper_dir / "rubric.json") as f:
        rubric = json.load(f)
    row["rubric_requirements"] = rubric.get("requirements")
    row["rubric_total_nodes"] = count_rubric_tasks(rubric)
    row["rubric_leaf_nodes"] = count_rubric_leaf_nodes(rubric)

    category_counts = count_rubric_by_category(rubric)
    row["rubric_code_development"] = category_counts["Code Development"]
    row["rubric_code_execution"] = category_counts["Code Execution"]
    row["rubric_result_analysis"] = category_counts["Result Analysis"]

    reference_files = []
    reference_file_urls = []
    reference_file_hf_uris = []

    for file_path in sorted(paper_dir.rglob("*")):
        if file_path.is_file():
            relative_path = file_path.relative_to(paper_dir.parent)
            relative_path_str = str(relative_path)

            reference_files.append(relative_path_str)
            reference_file_urls.append(
                f"https://huggingface.co/datasets/{repo_id}/resolve/main/{relative_path_str}"
            )
            reference_file_hf_uris.append(f"hf://datasets/{repo_id}@main/{relative_path_str}")

    row["reference_files"] = reference_files
    row["reference_file_urls"] = reference_file_urls
    row["reference_file_hf_uris"] = reference_file_hf_uris

    return row


def build_manifest(papers_dir: Path, repo_id: str) -> list[dict]:
    rows = []
    for paper_dir in sorted(papers_dir.iterdir()):
        if not paper_dir.is_dir() or paper_dir.name.startswith("."):
            continue
        rows.append(extract_paper_metadata(paper_dir, repo_id))
    return rows


def build_and_upload_data_parquet(
    api: HfApi,
    repo_id: str,
    branch: str,
    commit_message: str,
) -> None:
    # Build manifest for dataset viewer
    logger.info("Building dataset manifest...")
    manifest_rows = build_manifest(DEFAULT_PAPERS_DIR, repo_id)
    logger.info(f"Found {len(manifest_rows)} papers")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Copy all paper directories
        logger.info("Copying paper directories...")
        for paper_dir in DEFAULT_PAPERS_DIR.iterdir():
            if paper_dir.is_dir() and not paper_dir.name.startswith("."):
                shutil.copytree(paper_dir, temp_path / paper_dir.name)

        data_dir = temp_path / "data"
        data_dir.mkdir(exist_ok=True)

        # Create and save dataset as parquet
        logger.info("Creating dataset parquet file...")
        dataset = Dataset.from_list(manifest_rows)
        parquet_path = data_dir / "train-00000-of-00001.parquet"
        dataset.to_parquet(str(parquet_path))
        logger.info(f"Dataset preview schema: {dataset.column_names}")

        # Upload the prepared directory
        logger.info(f"Uploading to {repo_id}...")
        try:
            api.upload_folder(
                folder_path=str(temp_path),
                repo_id=repo_id,
                repo_type="dataset",
                revision=branch,
                commit_message=commit_message,
            )
        except Exception as exc:  # noqa: BLE001
            logger.error(f"Failed to upload dataset folder: {exc}")
            sys.exit(1)


def upload_dataset_card(repo_id: str, branch: str, commit_message: str) -> None:
    # Upload README card
    logger.info("Uploading README.md...")
    readme_content = DEFAULT_CARD_PATH.read_text(encoding="utf-8")

    upload_file(
        path_or_fileobj=io.BytesIO(readme_content.encode("utf-8")),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        revision=branch,
        commit_message=commit_message,
    )
    logger.info("README.md uploaded successfully")

    yaml_content = DEFAULT_YAML_CARD_PATH.read_text(encoding="utf-8")
    upload_file(
        path_or_fileobj=io.BytesIO(yaml_content.encode("utf-8")),
        path_in_repo="dataset_card.yaml",
        repo_id=repo_id,
        repo_type="dataset",
        revision=branch,
        commit_message=commit_message,
    )
    logger.info("dataset_card.yaml uploaded successfully")


def ensure_upload(repo_id: str, paper_idx: int) -> None:
    """Sanity-check the upload by re-downloading one paper's reference files."""
    logger.info("Ensuring upload...")
    dataset = load_dataset(repo_id)
    api = HfApi()
    paper_id = dataset["train"][paper_idx]["id"]
    downloaded_paths = []
    for file in dataset["train"][paper_idx]["reference_files"]:
        local_path = api.hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset")
        downloaded_paths.append(local_path)

    paper_path = Path(downloaded_paths[0]).parent
    logger.info(f"Downloaded paper and rubric for {paper_id} successfully to: {paper_path}")


def main() -> None:
    args = parse_args()
    api = HfApi()
    try:
        user_info = api.whoami()
    except Exception:
        # No cached token: prompt for an interactive login, then retry.
        login()
        user_info = api.whoami()
    username = user_info["name"]
    # Accept either '<namespace>/<name>' or a bare name under the user's namespace.
    repo_id = args.repo if "/" in args.repo else f"{username}/{args.repo}"

    try:
        api.repo_info(repo_id, repo_type="dataset")
        logger.info(f"Repository {repo_id} already exists")
    except Exception:
        create_repo(repo_id=repo_id, repo_type="dataset", private=args.private, exist_ok=True)
        logger.info(f"Created repository: {repo_id}")

    build_and_upload_data_parquet(api, repo_id, args.branch, args.commit_message)
    upload_dataset_card(repo_id, args.branch, args.commit_message)

    logger.info("Dataset upload complete.")
    logger.info(f"View at: https://huggingface.co/datasets/{repo_id}")
    ensure_upload(repo_id, paper_idx=0)


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Upload the PaperBench papers dataset to the Hugging Face Hub.",
    )
    parser.add_argument(
        "repo",
        help=(
            "Target dataset repository. Accepts either a full '<namespace>/<name>' "
            "or just the repository name (defaults to your user namespace)."
        ),
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create the Hugging Face dataset repository as private.",
    )
    parser.add_argument(
        "--branch",
        default="main",
        help="Branch or revision to push to (default: main).",
    )
    parser.add_argument(
        "--commit-message",
        default="Upload PaperBench papers dataset",
        help="Custom commit message for the dataset upload.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    main()
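
For reference, a typical invocation might look like the following, run from the project/paperbench directory; the repository name "paperbench-papers" is illustrative rather than prescribed by this change:

python paperbench/scripts/run_upload_huggingface.py paperbench-papers --private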
2 changes: 2 additions & 0 deletions project/paperbench/pyproject.toml
@@ -47,6 +47,8 @@ typeCheckingMode = "off" # we use mypy

[dependency-groups]
dev = [
"datasets>=4.1.1",
"huggingface-hub>=0.35.3",
"mypy>=1.15.0",
"poethepoet>=0.34.0",
"pre-commit>=4.2.0",