Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions benchmark_validation_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ From the 23 candidates, tasks were selected to maximize:

| # | Task ID | Language | Quality | Status | Notes |
|---|---------|----------|---------|--------|-------|
| 1 | `ep-eaglepoint-ai/bd_datasets_002-245` | TypeScript/Jest | 4/5 | ✅ Selected | Large refactor with Jest tests; 2212-line patch |
| 1 | `ep-eaglepoint-ai/bd_datasets_002-245` | Python/pytest | 4/5 | ✅ Selected | Large refactor with Python tests; 2212-line patch |
| 2 | `stellatogrp/cvxro-56` | Python | 4/5 | ✅ Selected | Tensor reshape in optimization library; pytest |
| 3 | `TrooHQ/troo-core-30` | Python/Django | 4/5 | ✅ Selected | Station locations API; Django test framework |
| 4 | `eclipse-hawkbit/hawkbit-2923` | Java | 2/5 | ❌ Rejected | Tests only check annotation presence, not behavior |
Expand Down Expand Up @@ -174,15 +174,16 @@ From the 23 candidates, tasks were selected to maximize:

#### `ep-eaglepoint-ai/bd_datasets_002-245`
- **Repository**: ep-eaglepoint-ai/bd_datasets_002
- **Language**: TypeScript
- **Language**: Python
- **Difficulty Score**: 3
- **Quality Score**: 0.80
- **Quality Rating**: 4/5
- **Patch Size**: 2,212 lines
- **Description**: Major refactoring of dataset processing pipeline
- **Test Strategy**: Jest tests covering refactored modules, data transformations, error handling
- **Test Strategy**: Python pytest tests covering refactored modules, domain objects, data transformations, and comprehensive requirements
- **Strengths**: Large, complex change; comprehensive test coverage
- **Weaknesses**: Very large patch may be difficult for solvers
- **Fix Applied**: Original tests were TypeScript/Jest but the repo is Python; replaced with Python pytest tests matching the actual codebase language

#### `stellatogrp/cvxro-56`
- **Repository**: stellatogrp/cvxro
Expand Down Expand Up @@ -290,6 +291,18 @@ Four improvements were implemented in the swe-forge pipeline to address systemic
- **Auto-select Docker image** based on task language instead of always using `python:3.12-slim`
- **Added `docker_write_file` helper** and test file copying from `meta.test_files` JSON into containers

### 5.6 Test File Path Validation (`src/swe/pipeline.rs`)

**Problem**: Test commands in `fail_to_pass` and `pass_to_pass` could reference files that don't exist in `meta.test_files`, causing silent test failures during evaluation. Two concrete issues were found:
1. `batocera-linux/batocera.linux-15418` used `python -m unittest tests/test_yquake2_riscv_config.py` — `unittest` only resolves a file-path argument when it is importable as a module, so this invocation was fragile; dotted module notation (or `pytest`, which accepts plain file paths) is the robust form. Fixed to use `pytest` instead.
2. `ep-eaglepoint-ai/bd_datasets_002-245` had TypeScript/Jest tests but the repository is Python — replaced with Python pytest tests.

**Solution**:
- Added `extract_test_paths_from_command()` — parses shell commands to find referenced test file paths (handles pytest, unittest dotted notation, Jest, vitest, Java, etc.; skips glob patterns)
- Added `validate_test_file_references()` — cross-checks files referenced in test commands against `meta.test_files` and exported basenames; logs `tracing::warn!` for missing references
- Called from `export_task_to_disk()` after writing test files
- Added 12 unit tests covering path extraction for all supported test runners and validation logic

---

## 6. Dataset Statistics
Expand All @@ -304,8 +317,8 @@ Four improvements were implemented in the swe-forge pipeline to address systemic
| Medium | 3 |
| Hard | 3 |
| **Language Distribution** | |
| Python | 4 |
| TypeScript/JavaScript | 4 |
| Python | 5 |
| TypeScript/JavaScript | 3 |
| Java | 1 |
| **Quality Scores** | |
| Mean quality score | 0.50 |
Expand Down
240 changes: 240 additions & 0 deletions src/swe/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,8 @@ fn export_task_to_disk(task: &SweTask, output_dir: &str) -> anyhow::Result<()> {
let tests_dir = dir.join("tests");
fs::create_dir_all(&tests_dir)?;

let mut written_basenames: HashSet<String> = HashSet::new();

if let Some(test_files_json) = task.meta.get("test_files") {
if let Ok(files) =
serde_json::from_str::<Vec<crate::swe::test_generator::TestFile>>(test_files_json)
Expand All @@ -800,11 +802,14 @@ fn export_task_to_disk(task: &SweTask, output_dir: &str) -> anyhow::Result<()> {
seen_names.insert(basename.clone());
basename
};
written_basenames.insert(unique_name.clone());
fs::write(tests_dir.join(&unique_name), &tf.content)?;
}
}
}

validate_test_file_references(task, &written_basenames);

for (i, cmd) in task.fail_to_pass.iter().enumerate() {
let filename = format!("fail_to_pass_{}.sh", i + 1);
fs::write(
Expand Down Expand Up @@ -847,3 +852,238 @@ fn append_pr_to_file(pr_file: &Option<String>, repo: &str, task_id: &str) {
let _ = writeln!(f, "{}", line);
}
}

/// Extract test file paths referenced in a shell command string.
///
/// Recognises common patterns:
/// - `tests/foo.py`, `src/test_bar.ts` (path-like tokens ending in test-file extensions)
/// - `python -m unittest tests.module` (dotted module notation)
///
/// Tokens containing glob characters (`*`, `?`) are skipped because they
/// cannot be checked against concrete file names.
fn extract_test_paths_from_command(cmd: &str) -> Vec<String> {
    // Extensions that identify a token as a test file path.
    const TEST_EXTENSIONS: [&str; 8] = [".py", ".ts", ".js", ".java", ".rs", ".go", ".rb", ".sh"];
    let mut paths = Vec::new();

    for token in cmd.split_whitespace() {
        // Strip shell quoting and trailing statement separators.
        let clean = token.trim_matches(|c: char| c == '\'' || c == '"' || c == ';');
        if clean.contains('*') || clean.contains('?') {
            // Glob pattern — cannot be validated against concrete files.
            continue;
        }
        if TEST_EXTENSIONS.iter().any(|ext| clean.ends_with(ext)) {
            paths.push(clean.to_string());
        }
    }

    // `python -m unittest` also accepts dotted module notation
    // (`tests.test_foo`); translate such tokens into the file path they
    // resolve to so they can be cross-checked against meta.test_files.
    if cmd.contains("python -m unittest") || cmd.contains("python3 -m unittest") {
        for token in cmd.split_whitespace() {
            let clean = token.trim_matches(|c: char| c == '\'' || c == '"' || c == ';');
            // Skip flags, explicit paths, dot-free tokens, and tokens that are
            // already file names. Previously a bare file name such as
            // `test_foo.py` was re-interpreted as a dotted module and produced
            // a spurious `test_foo/py.py` path (false missing-file warning).
            if clean.starts_with('-')
                || clean.contains('/')
                || !clean.contains('.')
                || TEST_EXTENSIONS.iter().any(|ext| clean.ends_with(ext))
            {
                continue;
            }
            let as_path = clean.replace('.', "/") + ".py";
            if !paths.contains(&as_path) {
                paths.push(as_path);
            }
        }
    }

    paths
}

/// Validate that test file paths referenced in `fail_to_pass` and `pass_to_pass`
/// commands correspond to files present in `meta.test_files`. Logs warnings for
/// any missing references so operators can fix the task before evaluation.
fn validate_test_file_references(task: &SweTask, written_basenames: &HashSet<String>) {
let test_file_paths: HashSet<String> = task
.meta
.get("test_files")
.and_then(|json| {
serde_json::from_str::<Vec<crate::swe::test_generator::TestFile>>(json).ok()
})
.unwrap_or_default()
.iter()
.map(|tf| tf.path.clone())
.collect();

let all_cmds = task.fail_to_pass.iter().chain(task.pass_to_pass.iter());

for cmd in all_cmds {
let referenced = extract_test_paths_from_command(cmd);
for ref_path in &referenced {
let basename = std::path::Path::new(ref_path)
.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| ref_path.clone());

let found_in_meta = test_file_paths.contains(ref_path)
|| test_file_paths.iter().any(|p| {
std::path::Path::new(p)
.file_name()
.map(|n| n.to_string_lossy() == basename)
.unwrap_or(false)
});

let found_on_disk = written_basenames.contains(&basename);

if !found_in_meta && !found_on_disk {
tracing::warn!(
task_id = %task.id,
command = %cmd,
missing_file = %ref_path,
"Test command references file not found in meta.test_files or exported tests"
);
}
}
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::swe::SweTask;

    // --- extract_test_paths_from_command ---------------------------------

    // A single pytest file argument is extracted verbatim.
    #[test]
    fn extract_paths_pytest_single() {
        let cmd = "python -m pytest tests/test_foo.py -q";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["tests/test_foo.py"]);
    }

    // Multiple file arguments are all extracted, in command order.
    #[test]
    fn extract_paths_pytest_multiple() {
        let cmd = "python -m pytest tests/test_a.py tests/test_b.py -q";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["tests/test_a.py", "tests/test_b.py"]);
    }

    // unittest dotted module notation is translated to the .py path it
    // resolves to.
    #[test]
    fn extract_paths_unittest_dotted() {
        let cmd = "python -m unittest tests.test_reshape_tensor";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["tests/test_reshape_tensor.py"]);
    }

    // Jest: a bare *.test.ts pattern argument is picked up as a path token.
    #[test]
    fn extract_paths_jest_ts() {
        let cmd = "yarn workspace @studio/pkg test --testPathPattern Foo.test.ts";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["Foo.test.ts"]);
    }

    // Java: the glob token (*.java) is skipped; the concrete file is kept.
    #[test]
    fn extract_paths_java() {
        let cmd = "javac *.java && java -cp .:app RectangleBehaviorTest.java";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["RectangleBehaviorTest.java"]);
    }

    // `cd` prefixes and inline env-var assignments do not confuse extraction.
    #[test]
    fn extract_paths_cd_env_pytest() {
        let cmd = "cd subdir && PYTHONPATH=repo python -m pytest tests/test_x.py -q";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["tests/test_x.py"]);
    }

    // A command with no file arguments yields no paths.
    #[test]
    fn extract_paths_no_test_files() {
        let cmd = "python -m compileall -q python-src";
        let paths = extract_test_paths_from_command(cmd);
        assert!(paths.is_empty());
    }

    // vitest: the explicit test file path is extracted.
    #[test]
    fn extract_paths_vitest() {
        let cmd = "pnpm --filter landing exec vitest --run src/components/test.test.ts";
        let paths = extract_test_paths_from_command(cmd);
        assert_eq!(paths, vec!["src/components/test.test.ts"]);
    }

    // --- validate_test_file_references -----------------------------------

    // Missing reference: validation must complete without panicking (the
    // mismatch is reported via tracing::warn!, which this test cannot
    // observe directly — it only pins the non-fatal contract).
    #[test]
    fn validate_warns_on_missing_file() {
        let mut task = SweTask::new("test-task-1", "owner/repo");
        task.fail_to_pass = vec!["python -m pytest tests/test_missing.py -q".to_string()];
        task.meta.insert(
            "test_files".to_string(),
            serde_json::to_string(&vec![crate::swe::test_generator::TestFile {
                path: "tests/test_other.py".to_string(),
                content: "pass".to_string(),
            }])
            .unwrap(),
        );
        let written: HashSet<String> = ["test_other.py".to_string()].into_iter().collect();
        validate_test_file_references(&task, &written);
    }

    // Matching reference: validation completes cleanly (no panic) when the
    // referenced file is present in meta.test_files.
    #[test]
    fn validate_no_warn_when_file_present() {
        let mut task = SweTask::new("test-task-2", "owner/repo");
        task.fail_to_pass = vec!["python -m pytest tests/test_foo.py -q".to_string()];
        task.meta.insert(
            "test_files".to_string(),
            serde_json::to_string(&vec![crate::swe::test_generator::TestFile {
                path: "tests/test_foo.py".to_string(),
                content: "pass".to_string(),
            }])
            .unwrap(),
        );
        let written: HashSet<String> = ["test_foo.py".to_string()].into_iter().collect();
        validate_test_file_references(&task, &written);
    }

    // End-to-end export: verifies the on-disk layout (prompt, workspace,
    // checks, exported tests, per-command shell scripts) and that checks.txt
    // carries both commands.
    // NOTE(review): the temp dir name is fixed, so parallel runs of this one
    // test (e.g. across processes) could collide — consider a unique suffix.
    #[test]
    fn export_task_creates_expected_files() {
        let tmp = std::env::temp_dir().join("swe_forge_test_export");
        let _ = fs::remove_dir_all(&tmp);

        let mut task = SweTask::new("test-export-1", "owner/repo");
        task.prompt = "Fix the bug".to_string();
        task.fail_to_pass = vec!["python -m pytest tests/test_fix.py -q".to_string()];
        task.pass_to_pass = vec!["python -m compileall -q src".to_string()];
        task.meta.insert(
            "test_files".to_string(),
            serde_json::to_string(&vec![crate::swe::test_generator::TestFile {
                path: "tests/test_fix.py".to_string(),
                content: "import unittest\nclass T(unittest.TestCase):\n def test_a(self): pass"
                    .to_string(),
            }])
            .unwrap(),
        );

        let result = export_task_to_disk(&task, tmp.to_str().unwrap());
        assert!(result.is_ok(), "export_task_to_disk failed: {:?}", result);

        let task_dir = tmp.join("test-export-1");
        assert!(task_dir.join("prompt.md").exists());
        assert!(task_dir.join("workspace.yaml").exists());
        assert!(task_dir.join("checks.txt").exists());
        assert!(task_dir.join("tests/test_fix.py").exists());
        assert!(task_dir.join("tests/fail_to_pass_1.sh").exists());
        assert!(task_dir.join("tests/pass_to_pass_1.sh").exists());

        let checks = fs::read_to_string(task_dir.join("checks.txt")).unwrap();
        assert!(checks.contains("python -m pytest tests/test_fix.py -q"));
        assert!(checks.contains("python -m compileall -q src"));

        let _ = fs::remove_dir_all(&tmp);
    }

    // infer_added_lines should pass through the PR's added_lines field.
    #[test]
    fn infer_added_lines_returns_pr_value() {
        let pr = EnrichedPullRequest {
            repository: "owner/repo".to_string(),
            number: 1,
            title: "test".to_string(),
            body: String::new(),
            base_sha: String::new(),
            merge_sha: String::new(),
            language: "python".to_string(),
            files_changed: 1,
            added_lines: 42,
            removed_lines: 10,
            changed_files: Vec::new(),
            stars: 100,
            issue_number: None,
            actor: String::new(),
            linked_issues: Vec::new(),
            metadata: HashMap::new(),
        };
        assert_eq!(infer_added_lines(&pr), 42);
    }
}
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
python -m unittest tests/test_yquake2_riscv_config.py
python -m compileall -q python-src
python -m pytest tests/test_yquake2_riscv_config.py -q
python -m compileall -q python-src
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash
# This test must FAIL on base commit, PASS after fix
python -m unittest tests/test_yquake2_riscv_config.py
python -m pytest tests/test_yquake2_riscv_config.py -q
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ created_at: 2026-02-17T11:06:01.166929859Z
patch: "diff --git a/package/batocera/core/batocera-system/Config.in b/package/batocera/core/batocera-system/Config.in\nindex 28ebc525e77..a28849978df 100644\n--- a/package/batocera/core/batocera-system/Config.in\n+++ b/package/batocera/core/batocera-system/Config.in\n@@ -1134,7 +1134,7 @@ config BR2_PACKAGE_BATOCERA_PORTS_SYSTEMS\n \t# Quake 2\n \tselect BR2_PACKAGE_LIBRETRO_VITAQUAKE2 if BR2_PACKAGE_BATOCERA_GLES3\n \tselect BR2_PACKAGE_VKQUAKE2\t\t if BR2_PACKAGE_BATOCERA_VULKAN && BR2_PACKAGE_XORG7\n-\tselect BR2_PACKAGE_YQUAKE2\t\t# ALL\n+\tselect BR2_PACKAGE_YQUAKE2\t\t if !BR2_riscv\n \tselect BR2_PACKAGE_YQUAKE2_XATRIX\tif BR2_PACKAGE_YQUAKE2\t# Mission Packs\n \tselect BR2_PACKAGE_YQUAKE2_ROGUE\tif BR2_PACKAGE_YQUAKE2\n \tselect BR2_PACKAGE_YQUAKE2_ZAERO\tif BR2_PACKAGE_YQUAKE2\n"
test_patch: ''
fail_to_pass:
- python -m unittest tests/test_yquake2_riscv_config.py
- python -m pytest tests/test_yquake2_riscv_config.py -q
pass_to_pass:
- python -m compileall -q python-src
install_config:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
cd j1p4vf-ecommerce-order-processing-refactor && npm test -- --runTestsByPath tests/refactor-additional.test.js
cd j1p4vf-ecommerce-order-processing-refactor && npm test -- --runTestsByPath tests/refactored.test.js
cd 9a9pcc-order-processing-refactor && PYTHONPATH=repository_after python -m pytest tests/test_domain_objects.py tests/test_refactoring.py tests/test_comprehensive_requirements.py -q
cd 9a9pcc-order-processing-refactor && PYTHONPATH=repository_before python -m pytest tests/test_order_processing.py -q
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash
# This test must FAIL on base commit, PASS after fix
cd j1p4vf-ecommerce-order-processing-refactor && npm test -- --runTestsByPath tests/refactor-additional.test.js
cd 9a9pcc-order-processing-refactor && PYTHONPATH=repository_after python -m pytest tests/test_domain_objects.py tests/test_refactoring.py tests/test_comprehensive_requirements.py -q
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash
# This test must PASS on base commit AND after fix
cd j1p4vf-ecommerce-order-processing-refactor && npm test -- --runTestsByPath tests/refactored.test.js
cd 9a9pcc-order-processing-refactor && PYTHONPATH=repository_before python -m pytest tests/test_order_processing.py -q
Loading
Loading