Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions benchmark_validation_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ From the 23 candidates, tasks were selected to maximize:

| # | Task ID | Language | Quality | Status | Notes |
|---|---------|----------|---------|--------|-------|
| 1 | `ep-eaglepoint-ai/bd_datasets_002-245` | TypeScript/Jest | 4/5 | ✅ Selected | Large refactor with Jest tests; 2212-line patch |
| 1 | `ep-eaglepoint-ai/bd_datasets_002-245` | Python/pytest | 4/5 | ✅ Selected | Large refactor with Python tests; 2212-line patch |
| 2 | `stellatogrp/cvxro-56` | Python | 4/5 | ✅ Selected | Tensor reshape in optimization library; pytest |
| 3 | `TrooHQ/troo-core-30` | Python/Django | 4/5 | ✅ Selected | Station locations API; Django test framework |
| 4 | `eclipse-hawkbit/hawkbit-2923` | Java | 2/5 | ❌ Rejected | Tests only check annotation presence, not behavior |
Expand Down Expand Up @@ -174,15 +174,16 @@ From the 23 candidates, tasks were selected to maximize:

#### `ep-eaglepoint-ai/bd_datasets_002-245`
- **Repository**: ep-eaglepoint-ai/bd_datasets_002
- **Language**: TypeScript
- **Language**: Python
- **Difficulty Score**: 3
- **Quality Score**: 0.80
- **Quality Rating**: 4/5
- **Patch Size**: 2,212 lines
- **Description**: Major refactoring of dataset processing pipeline
- **Test Strategy**: Jest tests covering refactored modules, data transformations, error handling
- **Test Strategy**: Python pytest tests covering refactored modules, domain objects, data transformations, and comprehensive requirements
- **Strengths**: Large, complex change; comprehensive test coverage
- **Weaknesses**: Very large patch may be difficult for solvers
- **Fix Applied**: Original tests were TypeScript/Jest but the repo is Python; replaced with Python pytest tests matching the actual codebase language

#### `stellatogrp/cvxro-56`
- **Repository**: stellatogrp/cvxro
Expand Down Expand Up @@ -290,6 +291,18 @@ Four improvements were implemented in the swe-forge pipeline to address systemic
- **Auto-select Docker image** based on task language instead of always using `python:3.12-slim`
- **Added `docker_write_file` helper** and test file copying from `meta.test_files` JSON into containers

### 5.6 Test File Path Validation (`src/swe/pipeline.rs`)

**Problem**: Test commands in `fail_to_pass` and `pass_to_pass` could reference files that don't exist in `meta.test_files`, causing silent test failures during evaluation. Two concrete issues were found:
1. `batocera-linux/batocera.linux-15418` used `python -m unittest tests/test_yquake2_riscv_config.py` — `unittest` only resolves a file-path argument when it is importable as a module, so this invocation was fragile; dotted module notation (or `pytest`, which accepts plain file paths) is the robust form. Fixed to use `pytest` instead.
2. `ep-eaglepoint-ai/bd_datasets_002-245` had TypeScript/Jest tests but the repository is Python — replaced with Python pytest tests.

**Solution**:
- Added `extract_test_paths_from_command()` — parses shell commands to find referenced test file paths (handles pytest, unittest dotted notation, Jest, vitest, Java, etc.; skips glob patterns)
- Added `validate_test_file_references()` — cross-checks files referenced in test commands against `meta.test_files` and exported basenames; logs `tracing::warn!` for missing references
- Called from `export_task_to_disk()` after writing test files
- Added 12 unit tests covering path extraction for all supported test runners and validation logic

---

## 6. Dataset Statistics
Expand All @@ -304,8 +317,8 @@ Four improvements were implemented in the swe-forge pipeline to address systemic
| Medium | 3 |
| Hard | 3 |
| **Language Distribution** | |
| Python | 4 |
| TypeScript/JavaScript | 4 |
| Python | 5 |
| TypeScript/JavaScript | 3 |
| Java | 1 |
| **Quality Scores** | |
| Mean quality score | 0.50 |
Expand Down
240 changes: 240 additions & 0 deletions src/swe/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,8 @@ fn export_task_to_disk(task: &SweTask, output_dir: &str) -> anyhow::Result<()> {
let tests_dir = dir.join("tests");
fs::create_dir_all(&tests_dir)?;

let mut written_basenames: HashSet<String> = HashSet::new();

if let Some(test_files_json) = task.meta.get("test_files") {
if let Ok(files) =
serde_json::from_str::<Vec<crate::swe::test_generator::TestFile>>(test_files_json)
Expand All @@ -800,11 +802,14 @@ fn export_task_to_disk(task: &SweTask, output_dir: &str) -> anyhow::Result<()> {
seen_names.insert(basename.clone());
basename
};
written_basenames.insert(unique_name.clone());
fs::write(tests_dir.join(&unique_name), &tf.content)?;
}
}
}

validate_test_file_references(task, &written_basenames);

for (i, cmd) in task.fail_to_pass.iter().enumerate() {
let filename = format!("fail_to_pass_{}.sh", i + 1);
fs::write(
Expand Down Expand Up @@ -847,3 +852,238 @@ fn append_pr_to_file(pr_file: &Option<String>, repo: &str, task_id: &str) {
let _ = writeln!(f, "{}", line);
}
}

/// Extract test file paths referenced in a shell command string.
///
/// Recognises common patterns:
/// - `tests/foo.py`, `src/test_bar.ts` (path-like tokens ending in test-file extensions)
/// - `python -m unittest tests.module` (dotted module notation)
///
/// Tokens containing glob characters (`*`, `?`) are skipped because they
/// cannot be checked against concrete file names.
fn extract_test_paths_from_command(cmd: &str) -> Vec<String> {
    // Extensions that identify a token as a test file path.
    const TEST_EXTENSIONS: [&str; 8] = [".py", ".ts", ".js", ".java", ".rs", ".go", ".rb", ".sh"];
    let mut paths = Vec::new();

    for token in cmd.split_whitespace() {
        // Strip shell quoting and trailing statement separators.
        let clean = token.trim_matches(|c: char| c == '\'' || c == '"' || c == ';');
        if clean.contains('*') || clean.contains('?') {
            // Glob pattern — cannot be validated against concrete files.
            continue;
        }
        if TEST_EXTENSIONS.iter().any(|ext| clean.ends_with(ext)) {
            paths.push(clean.to_string());
        }
    }

    // `python -m unittest` also accepts dotted module notation
    // (`tests.test_foo`); translate such tokens into the file path they
    // resolve to so they can be cross-checked against meta.test_files.
    if cmd.contains("python -m unittest") || cmd.contains("python3 -m unittest") {
        for token in cmd.split_whitespace() {
            let clean = token.trim_matches(|c: char| c == '\'' || c == '"' || c == ';');
            // Skip flags, explicit paths, dot-free tokens, and tokens that are
            // already file names. Previously a bare file name such as
            // `test_foo.py` was re-interpreted as a dotted module and produced
            // a spurious `test_foo/py.py` path (false missing-file warning).
            if clean.starts_with('-')
                || clean.contains('/')
                || !clean.contains('.')
                || TEST_EXTENSIONS.iter().any(|ext| clean.ends_with(ext))
            {
                continue;
            }
            let as_path = clean.replace('.', "/") + ".py";
            if !paths.contains(&as_path) {
                paths.push(as_path);
            }
        }
    }

    paths
}

/// Validate that test file paths referenced in `fail_to_pass` and `pass_to_pass`
/// commands correspond to files present in `meta.test_files`. Logs warnings for
/// any missing references so operators can fix the task before evaluation.
fn validate_test_file_references(task: &SweTask, written_basenames: &HashSet<String>) {
let test_file_paths: HashSet<String> = task
.meta
.get("test_files")
.and_then(|json| {
serde_json::from_str::<Vec<crate::swe::test_generator::TestFile>>(json).ok()
})
.unwrap_or_default()
.iter()
.map(|tf| tf.path.clone())
.collect();

let all_cmds = task.fail_to_pass.iter().chain(task.pass_to_pass.iter());

for cmd in all_cmds {
let referenced = extract_test_paths_from_command(cmd);
for ref_path in &referenced {
let basename = std::path::Path::new(ref_path)
.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| ref_path.clone());

let found_in_meta = test_file_paths.contains(ref_path)
|| test_file_paths.iter().any(|p| {
std::path::Path::new(p)
.file_name()
.map(|n| n.to_string_lossy() == basename)
.unwrap_or(false)
});

let found_on_disk = written_basenames.contains(&basename);

if !found_in_meta && !found_on_disk {
tracing::warn!(
task_id = %task.id,
command = %cmd,
missing_file = %ref_path,
"Test command references file not found in meta.test_files or exported tests"
);
}
}
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::swe::SweTask;

    // --- extract_test_paths_from_command ---------------------------------

    // A single pytest file argument is extracted verbatim.
    #[test]
    fn extract_paths_pytest_single() {
        let cmd = "python -m pytest tests/test_foo.py -q";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["tests/test_foo.py"]);
    }

    // Multiple file arguments are all extracted, in command order.
    #[test]
    fn extract_paths_pytest_multiple() {
        let cmd = "python -m pytest tests/test_a.py tests/test_b.py -q";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["tests/test_a.py", "tests/test_b.py"]);
    }

    // unittest dotted module notation is translated to the .py path it
    // resolves to.
    #[test]
    fn extract_paths_unittest_dotted() {
        let cmd = "python -m unittest tests.test_reshape_tensor";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["tests/test_reshape_tensor.py"]);
    }

    // Jest: a bare *.test.ts pattern argument is picked up as a path token.
    #[test]
    fn extract_paths_jest_ts() {
        let cmd = "yarn workspace @studio/pkg test --testPathPattern Foo.test.ts";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["Foo.test.ts"]);
    }

    // Java: the glob token (*.java) is skipped; the concrete file is kept.
    #[test]
    fn extract_paths_java() {
        let cmd = "javac *.java && java -cp .:app RectangleBehaviorTest.java";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["RectangleBehaviorTest.java"]);
    }

    // `cd` prefixes and inline env-var assignments do not confuse extraction.
    #[test]
    fn extract_paths_cd_env_pytest() {
        let cmd = "cd subdir && PYTHONPATH=repo python -m pytest tests/test_x.py -q";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["tests/test_x.py"]);
    }

    // A command with no file arguments yields no paths.
    #[test]
    fn extract_paths_no_test_files() {
        let cmd = "python -m compileall -q python-src";
        let paths = extract_test_paths_from_command(cmd);
        assert!(paths.is_empty());
    }

    // vitest: the explicit test file path is extracted.
    #[test]
    fn extract_paths_vitest() {
        let cmd = "pnpm --filter landing exec vitest --run src/components/test.test.ts";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["src/components/test.test.ts"]);
    }

    // --- validate_test_file_references -----------------------------------

    // Missing reference: validation must complete without panicking (the
    // mismatch is reported via tracing::warn!, which this test cannot
    // observe directly — it only pins the non-fatal contract).
    #[test]
    fn validate_warns_on_missing_file() {
        let mut task = SweTask::new("test-task-1", "owner/repo");
        task.fail_to_pass = vec!["python -m pytest tests/test_missing.py -q".to_string()];
        task.meta.insert(
            "test_files".to_string(),
            serde_json::to_string(&vec![crate::swe::test_generator::TestFile {
                path: "tests/test_other.py".to_string(),
                content: "pass".to_string(),
            }])
            .unwrap(),
        );
        let written: HashSet<String> = ["test_other.py".to_string()].into_iter().collect();
        validate_test_file_references(&task, &written);
    }

    // Matching reference: validation completes cleanly (no panic) when the
    // referenced file is present in meta.test_files.
    #[test]
    fn validate_no_warn_when_file_present() {
        let mut task = SweTask::new("test-task-2", "owner/repo");
        task.fail_to_pass = vec!["python -m pytest tests/test_foo.py -q".to_string()];
        task.meta.insert(
            "test_files".to_string(),
            serde_json::to_string(&vec![crate::swe::test_generator::TestFile {
                path: "tests/test_foo.py".to_string(),
                content: "pass".to_string(),
            }])
            .unwrap(),
        );
        let written: HashSet<String> = ["test_foo.py".to_string()].into_iter().collect();
        validate_test_file_references(&task, &written);
    }

    // End-to-end export: verifies the on-disk layout (prompt, workspace,
    // checks, exported tests, per-command shell scripts) and that checks.txt
    // carries both commands.
    // NOTE(review): the temp dir name is fixed, so parallel runs of this one
    // test (e.g. across processes) could collide — consider a unique suffix.
    #[test]
    fn export_task_creates_expected_files() {
        let tmp = std::env::temp_dir().join("swe_forge_test_export");
        let _ = fs::remove_dir_all(&tmp);

        let mut task = SweTask::new("test-export-1", "owner/repo");
        task.prompt = "Fix the bug".to_string();
        task.fail_to_pass = vec!["python -m pytest tests/test_fix.py -q".to_string()];
        task.pass_to_pass = vec!["python -m compileall -q src".to_string()];
        task.meta.insert(
            "test_files".to_string(),
            serde_json::to_string(&vec![crate::swe::test_generator::TestFile {
                path: "tests/test_fix.py".to_string(),
                content: "import unittest\nclass T(unittest.TestCase):\n def test_a(self): pass"
                    .to_string(),
            }])
            .unwrap(),
        );

        let result = export_task_to_disk(&task, tmp.to_str().unwrap());
        assert!(result.is_ok(), "export_task_to_disk failed: {:?}", result);

        let task_dir = tmp.join("test-export-1");
        assert!(task_dir.join("prompt.md").exists());
        assert!(task_dir.join("workspace.yaml").exists());
        assert!(task_dir.join("checks.txt").exists());
        assert!(task_dir.join("tests/test_fix.py").exists());
        assert!(task_dir.join("tests/fail_to_pass_1.sh").exists());
        assert!(task_dir.join("tests/pass_to_pass_1.sh").exists());

        let checks = fs::read_to_string(task_dir.join("checks.txt")).unwrap();
        assert!(checks.contains("python -m pytest tests/test_fix.py -q"));
        assert!(checks.contains("python -m compileall -q src"));

        let _ = fs::remove_dir_all(&tmp);
    }

    // infer_added_lines should pass through the PR's added_lines field.
    #[test]
    fn infer_added_lines_returns_pr_value() {
        let pr = EnrichedPullRequest {
            repository: "owner/repo".to_string(),
            number: 1,
            title: "test".to_string(),
            body: String::new(),
            base_sha: String::new(),
            merge_sha: String::new(),
            language: "python".to_string(),
            files_changed: 1,
            added_lines: 42,
            removed_lines: 10,
            changed_files: Vec::new(),
            stars: 100,
            issue_number: None,
            actor: String::new(),
            linked_issues: Vec::new(),
            metadata: HashMap::new(),
        };
        assert_eq!(infer_added_lines(&pr), 42);
    }
}
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
python -m unittest tests/test_yquake2_riscv_config.py
python -m compileall -q python-src
python -m pytest tests/test_yquake2_riscv_config.py -q
python -m compileall -q python-src
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash
# This test must FAIL on base commit, PASS after fix
python -m unittest tests/test_yquake2_riscv_config.py
python -m pytest tests/test_yquake2_riscv_config.py -q
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ created_at: 2026-02-17T11:06:01.166929859Z
patch: "diff --git a/package/batocera/core/batocera-system/Config.in b/package/batocera/core/batocera-system/Config.in\nindex 28ebc525e77..a28849978df 100644\n--- a/package/batocera/core/batocera-system/Config.in\n+++ b/package/batocera/core/batocera-system/Config.in\n@@ -1134,7 +1134,7 @@ config BR2_PACKAGE_BATOCERA_PORTS_SYSTEMS\n \t# Quake 2\n \tselect BR2_PACKAGE_LIBRETRO_VITAQUAKE2 if BR2_PACKAGE_BATOCERA_GLES3\n \tselect BR2_PACKAGE_VKQUAKE2\t\t if BR2_PACKAGE_BATOCERA_VULKAN && BR2_PACKAGE_XORG7\n-\tselect BR2_PACKAGE_YQUAKE2\t\t# ALL\n+\tselect BR2_PACKAGE_YQUAKE2\t\t if !BR2_riscv\n \tselect BR2_PACKAGE_YQUAKE2_XATRIX\tif BR2_PACKAGE_YQUAKE2\t# Mission Packs\n \tselect BR2_PACKAGE_YQUAKE2_ROGUE\tif BR2_PACKAGE_YQUAKE2\n \tselect BR2_PACKAGE_YQUAKE2_ZAERO\tif BR2_PACKAGE_YQUAKE2\n"
test_patch: ''
fail_to_pass:
- python -m unittest tests/test_yquake2_riscv_config.py
- python -m pytest tests/test_yquake2_riscv_config.py -q
pass_to_pass:
- python -m compileall -q python-src
install_config:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
cd j1p4vf-ecommerce-order-processing-refactor && npm test -- --runTestsByPath tests/refactor-additional.test.js
cd j1p4vf-ecommerce-order-processing-refactor && npm test -- --runTestsByPath tests/refactored.test.js
cd 9a9pcc-order-processing-refactor && PYTHONPATH=repository_after python -m pytest tests/test_domain_objects.py tests/test_refactoring.py tests/test_comprehensive_requirements.py -q
cd 9a9pcc-order-processing-refactor && PYTHONPATH=repository_before python -m pytest tests/test_order_processing.py -q
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash
# This test must FAIL on base commit, PASS after fix
cd j1p4vf-ecommerce-order-processing-refactor && npm test -- --runTestsByPath tests/refactor-additional.test.js
cd 9a9pcc-order-processing-refactor && PYTHONPATH=repository_after python -m pytest tests/test_domain_objects.py tests/test_refactoring.py tests/test_comprehensive_requirements.py -q
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash
# This test must PASS on base commit AND after fix
cd j1p4vf-ecommerce-order-processing-refactor && npm test -- --runTestsByPath tests/refactored.test.js
cd 9a9pcc-order-processing-refactor && PYTHONPATH=repository_before python -m pytest tests/test_order_processing.py -q
Loading
Loading