diff --git a/README.md b/README.md
index fb52720e..1efa4ec2 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,20 @@
-# 🤖 VLA-Arena: A Comprehensive Benchmark for Vision-Language-Action Models
| Suite Name | L0 | L1 | L2 |
|----------|----|----|----|
| **Dynamic Obstacles** | (image) | (image) | (image) |
-### 🔄 Anti-Interference Suite Visualization
+### 🔄 Distractor Suite Visualization
| Suite Name | L0 | L1 | L2 |
|----------|----|----|----|
| **Static Distractors** | (image) | (image) | (image) |
| **Dynamic Distractors** | (image) | (image) | (image) |
-### 🎯 Extrapolation Suite Visualization
+### 🎯 Extrapolation Capability Suite Visualization
| Suite Name | L0 | L1 | L2 |
|----------|----|----|----|
@@ -188,7 +207,7 @@ VLA-Arena provides 11 specialized task suites with 150+ tasks in total, organized into four main
| **Task Workflow** | (image) | (image) | (image) |
| **Unseen Objects** | (image) | (image) | (image) |
-### 📈 Long-Horizon Suite Visualization
+### 📈 Long-Range Planning Suite Visualization
| Suite Name | L0 | L1 | L2 |
|----------|----|----|----|
@@ -277,6 +296,9 @@ vla-arena.download-tasks list --repo vla-arena/tasks
# Install a single task suite
vla-arena.download-tasks install robustness_dynamic_distractors --repo vla-arena/tasks
+# Install multiple task suites at once
+vla-arena.download-tasks install hazard_avoidance object_state_preservation --repo vla-arena/tasks
+
# Install all task suites (recommended)
vla-arena.download-tasks install-all --repo vla-arena/tasks
```
@@ -319,7 +341,7 @@ vla-arena.manage-tasks upload ./packages/my_task.vlap --repo your-username/your-
### Performance Evaluation of VLA Models on the VLA-Arena Benchmark
-We compare six models along four dimensions: **Safety**, **Anti-Interference**, **Extrapolation**, and **Long-Horizon**. Performance trends across three difficulty levels (L0–L2) are shown on a unified scale (0.0–1.0) for easy cross-model comparison. Safety tasks report both cumulative cost (CC, shown in parentheses) and success rate (SR), while other tasks report success rate only. **Bold** numbers mark the best performance at each difficulty level.
+We compare six models along four dimensions: **Safety**, **Distractors**, **Extrapolation Capability**, and **Long-Range Planning**. Performance trends across three difficulty levels (L0–L2) are shown on a unified scale (0.0–1.0) for easy cross-model comparison. Safety tasks report both cumulative cost (CC, shown in parentheses) and success rate (SR), while other tasks report success rate only. **Bold** numbers mark the best performance at each difficulty level.
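
Neither metric is formally defined in this diff; a hedged reading, following common usage in safe-RL benchmarks, is that SR is the fraction of successful rollouts and CC accumulates a per-step safety cost c_t over an episode of length T:

```latex
\mathrm{SR} = \frac{\#\{\text{successful episodes}\}}{\#\{\text{evaluated episodes}\}},
\qquad
\mathrm{CC} = \sum_{t=1}^{T} c_t
```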
#### 🛡️ Safety Performance
@@ -386,6 +408,38 @@ vla-arena.manage-tasks upload ./packages/my_task.vlap --repo your-username/your-
If you find VLA-Arena useful in your research, please cite our work:
+## Contributing
+
+We welcome contributions from the community. You can contribute to VLA-Arena in several ways:
+
+### 🤖 Upload Model Results
+
+
+**How to contribute:**
+1. Evaluate your model on VLA-Arena tasks
+2. Follow the submission guidelines in our leaderboard repository
+3. Submit a pull request with your results
+
+📝 **Detailed instructions**: [Contributing Model Results](https://github.com/vla-arena/vla-arena.github.io#contributing-your-model-results)
+
+### 🎯 Upload Task Designs
+
+
+**How to contribute:**
+1. Design your custom tasks with CBDDL
+2. Package your tasks following our guide
+3. Submit your tasks to our task store
+
+📝 **Detailed instructions**: [Contributing Tasks](https://github.com/vla-arena/vla-arena.github.io#contributing-your-tasks)
+
+### 💡 Other Ways to Contribute
+
+- **Report issues**: Found a bug? [Open an issue](https://github.com/PKU-Alignment/VLA-Arena/issues)
+- **Improve documentation**: Help us make the documentation better
+- **Feature requests**: Suggest new features or improvements
+
+---
+
## License

This project is licensed under the Apache 2.0 License - see [LICENSE](LICENSE) for details.
diff --git a/image/logo.jpeg b/image/logo.jpeg
new file mode 100644
index 00000000..31b3d51d
Binary files /dev/null and b/image/logo.jpeg differ
diff --git a/pyproject.toml b/pyproject.toml
index 163bd4d0..34abf545 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
{name = "Borong Zhang"},
{name = "Jiachen Shen"},
]
-description = "VLA-Arena: A Comprehensive Benchmark for Vision-Language-Action Models in Robotic Manipulation"
+description = "VLA-Arena: An Open-Source Framework for Benchmarking Vision-Language-Action Models"
readme = "README.md"
license = {text = "Apache-2.0"}
requires-python = "==3.11"
@@ -72,7 +72,7 @@ openvla = [
"tensorflow==2.15.0",
"tensorflow_datasets==4.9.3",
"tensorflow_graphics==2021.12.3",
- "dlimp @ git+https://github.com/moojink/dlimp_openvla"
+ # Note: dlimp must be installed separately via: pip install git+https://github.com/moojink/dlimp_openvla
]
openvla-oft = [
@@ -93,12 +93,13 @@ openvla-oft = [
"torch==2.2.0",
"torchvision==0.17.0",
"torchaudio==2.2.0",
- "transformers @ git+https://github.com/moojink/transformers-openvla-oft.git", # IMPORTANT: Use this fork for bidirectional attn (for parallel decoding)
+ "transformers==4.40.1",
+ # Note: For OFT support, install custom transformers via: pip install git+https://github.com/moojink/transformers-openvla-oft.git
"wandb",
"tensorflow==2.15.0",
"tensorflow_datasets==4.9.3",
"tensorflow_graphics==2021.12.3",
- "dlimp @ git+https://github.com/moojink/dlimp_openvla",
+ # Note: dlimp must be installed separately via: pip install git+https://github.com/moojink/dlimp_openvla
"diffusers==0.30.3",
"imageio",
"uvicorn",
@@ -110,7 +111,7 @@ univla = [
"absl-py==2.1.0",
"accelerate==0.32.1",
"braceexpand==0.1.7",
- "dlimp @ git+https://github.com/moojink/dlimp_openvla",
+ # Note: dlimp must be installed separately via: pip install git+https://github.com/moojink/dlimp_openvla
"draccus==0.8.0",
"einops==0.8.1",
"ema-pytorch==0.5.1",
@@ -179,7 +180,8 @@ smolvla = [
"num2words==0.5.14",
"accelerate==1.7.0",
"safetensors==0.4.3",
- "lerobot @ git+https://github.com/propellanesjc/smolvla_vla-arena",
+ "lerobot>=2.0.0",
+ # Note: For SmolVLA-specific fork, install via: pip install git+https://github.com/propellanesjc/smolvla_vla-arena
"draccus",
]
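
The pyproject.toml changes above replace direct git-URL dependencies (`pkg @ git+https://...`) with notes telling users to install those forks by hand, likely because PyPI rejects direct-URL dependencies in published distributions. A minimal sketch, not part of VLA-Arena, of how a startup check for the now-manual installs could look; the module name `dlimp` and the pip command come from the notes above, everything else is an assumption:

```python
# Hypothetical sketch: warn when a fork that pyproject.toml no longer
# pulls in automatically is missing. Assumes the dlimp distribution
# exposes a top-level `dlimp` module.
import importlib.util

SEPARATELY_INSTALLED = {
    'dlimp': 'pip install git+https://github.com/moojink/dlimp_openvla',
}

for module, hint in SEPARATELY_INSTALLED.items():
    if importlib.util.find_spec(module) is None:
        print(f"Missing optional dependency '{module}'. Install it with: {hint}")
```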
diff --git a/scripts/download_tasks.py b/scripts/download_tasks.py
index d34f5b82..b379d536 100644
--- a/scripts/download_tasks.py
+++ b/scripts/download_tasks.py
@@ -1,4 +1,18 @@
#!/usr/bin/env python3
+# Copyright 2025 The VLA-Arena Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""
VLA-Arena Task Suite Downloader
@@ -7,10 +21,10 @@
Usage:
# List available tasks
python scripts/download_tasks.py list --repo username/vla-arena-tasks
-
+
# Download a single task suite
python scripts/download_tasks.py install robustness_dynamic_distractors --repo username/vla-arena-tasks
-
+
# Download all task suites
python scripts/download_tasks.py install-all --repo username/vla-arena-tasks
"""
@@ -23,7 +37,10 @@
# Add parent directory to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-from vla_arena.vla_arena.utils.asset_manager import TaskCloudManager, TaskInstaller
+from vla_arena.vla_arena.utils.asset_manager import (
+ TaskCloudManager,
+ TaskInstaller,
+)
try:
@@ -84,9 +101,19 @@ def install_task(
)
if success:
- print(colored(f'\n✓ Task suite {task_name} installed successfully!', 'green'))
+ print(
+ colored(
+ f'\n✓ Task suite {task_name} installed successfully!',
+ 'green',
+ ),
+ )
else:
- print(colored(f'\n❌ Failed to install task suite {task_name}', 'red'))
+ print(
+ colored(
+ f'\n❌ Failed to install task suite {task_name}',
+ 'red',
+ ),
+ )
return success
@@ -112,7 +139,9 @@ def install_all_tasks(
return
print(f'\nPreparing to install {len(packages)} task suites')
- print('Note: Shared assets will be automatically skipped if already installed.\n')
+ print(
+ 'Note: Shared assets will be automatically skipped if already installed.\n',
+ )
# Confirmation
response = input('Continue? [y/N]: ')
@@ -184,14 +213,18 @@ def show_installed_tasks():
installed = get_installed_tasks()
if installed:
- print(colored(f'\n✓ {len(installed)} task suites installed:\n', 'green'))
+ print(
+ colored(f'\n✓ {len(installed)} task suites installed:\n', 'green'),
+ )
for i, task in enumerate(installed, 1):
print(f' {i:2d}. {task}')
print()
else:
print(colored('\nNo task suites installed', 'yellow'))
print('\nUse the following command to install tasks:')
- print(f' python scripts/download_tasks.py install-all --repo {DEFAULT_REPO}\n')
+ print(
+ f' python scripts/download_tasks.py install-all --repo {DEFAULT_REPO}\n',
+ )
def main():
@@ -202,13 +235,16 @@ def main():
Examples:
# View installed tasks
python scripts/download_tasks.py installed
-
+
# List available tasks
python scripts/download_tasks.py list --repo vla-arena/tasks
-
+
# Install a single task
python scripts/download_tasks.py install robustness_dynamic_distractors --repo vla-arena/tasks
-
+
+ # Install multiple tasks
+ python scripts/download_tasks.py install hazard_avoidance object_state_preservation --repo vla-arena/tasks
+
# Install all tasks
python scripts/download_tasks.py install-all --repo vla-arena/tasks
""",
@@ -217,7 +253,10 @@ def main():
subparsers = parser.add_subparsers(dest='command', help='Commands')
# list command
- list_parser = subparsers.add_parser('list', help='List available task suites')
+ list_parser = subparsers.add_parser(
+ 'list',
+ help='List available task suites',
+ )
list_parser.add_argument(
'--repo',
default=DEFAULT_REPO,
@@ -228,8 +267,15 @@ def main():
subparsers.add_parser('installed', help='Show installed task suites')
# install command
- install_parser = subparsers.add_parser('install', help='Install a single task suite')
- install_parser.add_argument('task_name', help='Task suite name')
+ install_parser = subparsers.add_parser(
+ 'install',
+ help='Install one or more task suites',
+ )
+ install_parser.add_argument(
+ 'task_names',
+ nargs='+',
+ help='Task suite name(s)',
+ )
install_parser.add_argument(
'--repo',
default=DEFAULT_REPO,
@@ -248,7 +294,10 @@ def main():
)
# install-all command
- install_all_parser = subparsers.add_parser('install-all', help='Install all task suites')
+ install_all_parser = subparsers.add_parser(
+ 'install-all',
+ help='Install all task suites',
+ )
install_all_parser.add_argument(
'--repo',
default=DEFAULT_REPO,
@@ -270,13 +319,56 @@ def main():
show_installed_tasks()
elif args.command == 'install':
- install_task(
- task_name=args.task_name,
- repo_id=args.repo,
- token=args.token,
- overwrite=args.overwrite,
- skip_existing_assets=getattr(args, 'skip_existing_assets', False),
- )
+ task_names = args.task_names
+ total = len(task_names)
+
+ if total > 1:
+ print(
+ f'\nPreparing to install {total} task suites: {", ".join(task_names)}',
+ )
+ print(
+ 'Note: Shared assets will be automatically skipped if already installed.\n',
+ )
+
+ successful = []
+ failed = []
+
+ for i, task_name in enumerate(task_names, 1):
+ if total > 1:
+ print(f'\n[{i}/{total}] Installing: {task_name}')
+ print('-' * 80)
+
+ success = install_task(
+ task_name=task_name,
+ repo_id=args.repo,
+ token=args.token,
+ overwrite=args.overwrite,
+ skip_existing_assets=getattr(
+ args,
+ 'skip_existing_assets',
+ False,
+ ),
+ )
+
+ if success:
+ successful.append(task_name)
+ else:
+ failed.append(task_name)
+
+ # Display statistics if multiple tasks
+ if total > 1:
+ print('\n' + '=' * 80)
+ print(f'\n✓ Installation complete: {len(successful)}/{total}')
+
+ if successful:
+ print('\nSuccessfully installed:')
+ for task in successful:
+ print(f' ✓ {task}')
+
+ if failed:
+ print('\nFailed to install:')
+ for task in failed:
+ print(f' ✗ {task}')
elif args.command == 'install-all':
install_all_tasks(
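
For reference, the new `install` branch above boils down to the loop below. This is a condensed sketch, not the script's actual code: it assumes it runs inside scripts/download_tasks.py where `install_task` is defined, the `install_many` helper name is hypothetical, and the progress and summary printing from the real hunk is omitted.

```python
# Hypothetical helper: condensed form of the multi-suite install dispatch.
# Failures are collected so one bad suite does not abort the rest.
def install_many(task_names, repo_id, token=None,
                 overwrite=False, skip_existing_assets=False):
    successful, failed = [], []
    for name in task_names:
        ok = install_task(  # defined earlier in scripts/download_tasks.py
            task_name=name,
            repo_id=repo_id,
            token=token,
            overwrite=overwrite,
            skip_existing_assets=skip_existing_assets,
        )
        (successful if ok else failed).append(name)
    return successful, failed
```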
diff --git a/tests/test_download_tasks.py b/tests/test_download_tasks.py
new file mode 100644
index 00000000..44434550
--- /dev/null
+++ b/tests/test_download_tasks.py
@@ -0,0 +1,278 @@
+"""
+Test download_tasks.py script functionality
+"""
+
+import argparse
+from unittest.mock import Mock, patch
+
+import pytest
+
+
+@pytest.fixture
+def mock_task_cloud_manager():
+ """Mock TaskCloudManager"""
+ with patch('scripts.download_tasks.TaskCloudManager') as mock:
+ yield mock
+
+
+@pytest.fixture
+def mock_task_installer():
+ """Mock TaskInstaller"""
+ with patch('scripts.download_tasks.TaskInstaller') as mock:
+ yield mock
+
+
+class TestDownloadTasksSingleInstall:
+ """Test installing a single task"""
+
+ def test_install_single_task(self, mock_task_cloud_manager, mock_task_installer):
+ """Test installing a single task suite"""
+ from scripts.download_tasks import install_task
+
+ # Mock successful installation
+ mock_cloud = Mock()
+ mock_cloud.download_and_install.return_value = True
+ mock_task_cloud_manager.return_value = mock_cloud
+
+ result = install_task(
+ task_name='test_task',
+ repo_id='test/repo',
+ token=None,
+ overwrite=False,
+ skip_existing_assets=False,
+ )
+
+ assert result is True
+ mock_cloud.download_and_install.assert_called_once_with(
+ package_name='test_task',
+ overwrite=False,
+ skip_existing_assets=False,
+ token=None,
+ )
+
+
+class TestDownloadTasksMultipleInstall:
+ """Test installing multiple tasks"""
+
+ def test_install_multiple_tasks_success(
+ self, mock_task_cloud_manager, mock_task_installer
+ ):
+ """Test installing multiple task suites successfully"""
+ from scripts.download_tasks import install_task
+
+ # Mock successful installations
+ mock_cloud = Mock()
+ mock_cloud.download_and_install.return_value = True
+ mock_task_cloud_manager.return_value = mock_cloud
+
+ task_names = ['task1', 'task2', 'task3']
+ successful = []
+ failed = []
+
+ for task_name in task_names:
+ success = install_task(
+ task_name=task_name,
+ repo_id='test/repo',
+ token=None,
+ overwrite=False,
+ skip_existing_assets=True, # Important for multiple installs
+ )
+ if success:
+ successful.append(task_name)
+ else:
+ failed.append(task_name)
+
+ assert len(successful) == 3
+ assert len(failed) == 0
+ assert mock_cloud.download_and_install.call_count == 3
+
+ def test_install_multiple_tasks_partial_failure(
+ self, mock_task_cloud_manager, mock_task_installer
+ ):
+ """Test installing multiple tasks with some failures"""
+ from scripts.download_tasks import install_task
+
+ # Mock: first succeeds, second fails, third succeeds
+ mock_cloud = Mock()
+ mock_cloud.download_and_install.side_effect = [True, False, True]
+ mock_task_cloud_manager.return_value = mock_cloud
+
+ task_names = ['task1', 'task2', 'task3']
+ successful = []
+ failed = []
+
+ for task_name in task_names:
+ success = install_task(
+ task_name=task_name,
+ repo_id='test/repo',
+ token=None,
+ overwrite=False,
+ skip_existing_assets=True,
+ )
+ if success:
+ successful.append(task_name)
+ else:
+ failed.append(task_name)
+
+ assert len(successful) == 2
+ assert len(failed) == 1
+ assert 'task2' in failed
+ assert 'task1' in successful
+ assert 'task3' in successful
+
+
+class TestDownloadTasksCLI:
+ """Test CLI argument parsing"""
+
+ def test_install_command_accepts_multiple_tasks(self):
+ """Test that install command accepts multiple task names"""
+ from scripts.download_tasks import main
+
+ with patch('sys.argv', [
+ 'download_tasks.py',
+ 'install',
+ 'task1',
+ 'task2',
+ 'task3',
+ '--repo',
+ 'test/repo',
+ ]):
+ with patch('scripts.download_tasks.install_task') as mock_install:
+ mock_install.return_value = True
+ try:
+ main()
+ except SystemExit:
+ pass
+
+ # Verify install_task was called 3 times
+ assert mock_install.call_count == 3
+
+ def test_install_command_with_skip_existing_assets(self):
+ """Test that skip-existing-assets flag works with multiple tasks"""
+ from scripts.download_tasks import main
+
+ with patch('sys.argv', [
+ 'download_tasks.py',
+ 'install',
+ 'task1',
+ 'task2',
+ '--repo',
+ 'test/repo',
+ '--skip-existing-assets',
+ ]):
+ with patch('scripts.download_tasks.install_task') as mock_install:
+ mock_install.return_value = True
+ try:
+ main()
+ except SystemExit:
+ pass
+
+ # Verify skip_existing_assets was passed correctly
+ for call in mock_install.call_args_list:
+ assert call[1]['skip_existing_assets'] is True
+
+
+class TestListTasks:
+ """Test listing available tasks"""
+
+ def test_list_available_tasks(self, mock_task_cloud_manager):
+ """Test listing available task suites"""
+ from scripts.download_tasks import list_available_tasks
+
+ # Mock package list
+ mock_cloud = Mock()
+ mock_cloud.list_packages.return_value = [
+ 'task1',
+ 'task2',
+ 'task3',
+ ]
+ mock_task_cloud_manager.return_value = mock_cloud
+
+ packages = list_available_tasks(repo_id='test/repo')
+
+ assert len(packages) == 3
+ assert 'task1' in packages
+ mock_cloud.list_packages.assert_called_once()
+
+
+class TestInstallAll:
+ """Test install-all functionality"""
+
+ @patch('builtins.input', return_value='y')
+ def test_install_all_tasks(
+ self,
+ mock_input,
+ mock_task_cloud_manager,
+ mock_task_installer,
+ ):
+ """Test installing all task suites"""
+ from scripts.download_tasks import install_all_tasks
+
+ # Mock task list
+ mock_cloud = Mock()
+ mock_cloud.list_packages.return_value = ['task1', 'task2']
+ mock_cloud.download_and_install.return_value = True
+ mock_task_cloud_manager.return_value = mock_cloud
+
+ with patch('scripts.download_tasks.list_available_tasks') as mock_list:
+ mock_list.return_value = ['task1', 'task2']
+
+ install_all_tasks(
+ repo_id='test/repo',
+ token=None,
+ overwrite=False,
+ )
+
+ # Should call download_and_install for each task
+ assert mock_cloud.download_and_install.call_count >= 2
+
+ @patch('builtins.input', return_value='n')
+ def test_install_all_tasks_cancelled(
+ self,
+ mock_input,
+ mock_task_cloud_manager,
+ ):
+ """Test cancelling install-all"""
+ from scripts.download_tasks import install_all_tasks
+
+ mock_cloud = Mock()
+ mock_cloud.list_packages.return_value = ['task1', 'task2']
+ mock_task_cloud_manager.return_value = mock_cloud
+
+ with patch('scripts.download_tasks.list_available_tasks') as mock_list:
+ mock_list.return_value = ['task1', 'task2']
+
+ install_all_tasks(
+ repo_id='test/repo',
+ token=None,
+ overwrite=False,
+ )
+
+ # Should not call download_and_install
+ mock_cloud.download_and_install.assert_not_called()
+
+
+class TestGetInstalledTasks:
+ """Test getting installed tasks"""
+
+ def test_get_installed_tasks(self):
+ """Test retrieving list of installed tasks"""
+ from scripts.download_tasks import get_installed_tasks
+
+ # This should return a list (might be empty in test environment)
+ tasks = get_installed_tasks()
+ assert isinstance(tasks, list)
+
+ def test_show_installed_tasks(self, capsys):
+ """Test showing installed tasks"""
+ from scripts.download_tasks import show_installed_tasks
+
+ with patch('scripts.download_tasks.get_installed_tasks') as mock_get:
+ mock_get.return_value = ['task1', 'task2']
+
+ show_installed_tasks()
+
+ captured = capsys.readouterr()
+ # Should show the tasks
+ assert 'task1' in captured.out or 'task2' in captured.out
+
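
The new test module can be run on its own; a sketch assuming pytest is installed and the repository root is the working directory, so that the `scripts` and `tests` paths used in the patches above resolve:

```python
# Run the new download_tasks tests; exits with pytest's status code.
import sys

import pytest

sys.exit(pytest.main(['tests/test_download_tasks.py', '-v']))
```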