Skip to content

Commit 728edbd

Browse files
authored
feat(swe): validate benchmark dataset and harden pipeline quality gates (#10)
Generate and validate a 9-task SWE-bench dataset (3 easy, 3 medium, 3 hard) selected from 23 candidates across 4 generation batches. Implement pipeline improvements to address systemic quality issues discovered during validation.

Pipeline quality improvements (src/swe/):
- test_generator.rs: Increase MAX_VALIDATION_RETRIES from 2 to 3. Reject (instead of accept) empty fail_to_pass, string-matching tests after retries, dual-commit validation failures, and patch-apply failures. Enhance system prompt with explicit pass_to_pass verification instructions requiring agents to use existing test infrastructure rather than creating new test files.
- filters.rs: Activate added_lines range validation (was ignored via underscore prefix). Add docs-only change detection via is_docs_only_change() heuristic that checks file extensions and names against known documentation patterns. Accept new changed_files parameter for file-level filtering.
- quality.rs: Raise min_quality_score default from 0.1 to 0.25. Require both score threshold AND classification.quality_good for a task to pass the gate.
- pipeline.rs: Pass enriched.changed_files to keep_candidate() to enable the new docs-only filter.
- harness.rs: Increase clone depth from 100 to 500. Add --unshallow fallback when shallow clone misses target commit. Auto-select Docker image based on task language. Add docker_write_file() helper and test file copying from meta.test_files JSON into containers.
- docker_sandbox.rs: Increase clone depth from 50 to 500 for consistency.

Dataset and documentation:
- test-run/: Raw generated tasks across easy, easy2, medium, hard batches (23 candidates total) with workspace.yaml, checks.txt, prompt.md, and test files for each task.
- validated-dataset/: 9 curated tasks organized by difficulty with full workspace metadata, test scripts, and parquet shards.
- benchmark_validation_report.md: Detailed analysis of all 23 candidates with quality ratings, rejection reasons, and pipeline recommendations.
- validation_summary.json: Machine-readable validation metrics.
1 parent 9f0c2c7 commit 728edbd

File tree

231 files changed

+5727
-13
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

231 files changed

+5727
-13
lines changed

benchmark_validation_report.md

Lines changed: 353 additions & 0 deletions
Large diffs are not rendered by default.

src/swe/docker_sandbox.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ impl DockerSandbox {
105105

106106
// Clone the repository
107107
let clone_cmd = format!(
108-
"git clone --depth 50 https://github.com/{}.git /repo 2>&1",
108+
"git clone --depth 500 https://github.com/{}.git /repo 2>&1",
109109
repo
110110
);
111111
let clone = sandbox.exec(&clone_cmd, 180_000).await;

src/swe/filters.rs

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ impl SweepFilter {
5656
language: &str,
5757
stars: u32,
5858
files_changed: usize,
59-
_added_lines: usize,
59+
added_lines: usize,
60+
changed_files: &[String],
6061
) -> FilterResult {
6162
let mut reasons = Vec::new();
6263
let mut score = 1.0f64;
@@ -97,6 +98,27 @@ impl SweepFilter {
9798
score -= 0.25;
9899
}
99100

101+
if added_lines > 0 && added_lines < self.config.min_added_lines {
102+
reasons.push(format!(
103+
"added lines {} below minimum {}",
104+
added_lines, self.config.min_added_lines
105+
));
106+
score -= 0.2;
107+
}
108+
109+
if added_lines > self.config.max_added_lines {
110+
reasons.push(format!(
111+
"added lines {} above maximum {}",
112+
added_lines, self.config.max_added_lines
113+
));
114+
score -= 0.2;
115+
}
116+
117+
if !changed_files.is_empty() && Self::is_docs_only_change(changed_files) {
118+
reasons.push("all changed files are documentation/config only".to_string());
119+
score -= 0.3;
120+
}
121+
100122
let accepted = reasons.is_empty();
101123
if accepted {
102124
reasons.push("candidate accepted".to_string());
@@ -108,4 +130,33 @@ impl SweepFilter {
108130
reasons,
109131
}
110132
}
133+
134+
/// Returns `true` when every path in `files` looks like documentation,
/// configuration, or a static asset rather than source code.
///
/// A path counts as documentation when either:
/// - it has a real extension (a non-empty stem followed by `.ext`) that is in
///   the known documentation/config/asset list, or
/// - its basename is a well-known documentation file name (`README`,
///   `LICENSE`, `.gitignore`, ...), optionally followed by a `-`/`_` suffix
///   that carries no extension (e.g. `LICENSE-MIT`).
///
/// Matching is case-insensitive and only looks at the final `/`-separated
/// path component. An empty slice is never "docs only".
fn is_docs_only_change(files: &[String]) -> bool {
    const DOC_EXTENSIONS: &[&str] = &[
        "md", "txt", "yml", "yaml", "json", "toml", "ini", "cfg", "rst", "adoc", "csv", "svg",
        "png", "jpg", "jpeg", "gif", "ico",
    ];
    const DOC_NAMES: &[&str] = &[
        "readme",
        "changelog",
        "license",
        "licence",
        "contributing",
        "authors",
        "codeowners",
        "code_of_conduct",
        ".gitignore",
        ".editorconfig",
        ".prettierrc",
        ".eslintignore",
    ];

    // `all` on an empty iterator is vacuously true; a change that touches no
    // files must not be reported as documentation-only.
    if files.is_empty() {
        return false;
    }

    files.iter().all(|f| {
        let lower = f.to_lowercase();
        let basename = lower.rsplit('/').next().unwrap_or(&lower);

        // Only treat the text after the last dot as an extension when there is
        // a stem in front of it. This keeps an extensionless file called
        // `json` (or a dotfile such as `.md`) from matching the extension list.
        let has_doc_extension = basename
            .rsplit_once('.')
            .map_or(false, |(stem, ext)| {
                !stem.is_empty() && DOC_EXTENSIONS.contains(&ext)
            });

        // A bare prefix match would classify `license_checker.rs` or
        // `authors.go` as documentation. Require the whole basename to be the
        // doc name, or the doc name plus a `-`/`_` suffix with no extension
        // (`LICENSE-MIT`, `LICENSE_APACHE`). Names with a doc extension are
        // already covered by the extension check above.
        let has_doc_name = DOC_NAMES.iter().any(|name| {
            basename.strip_prefix(*name).map_or(false, |rest| {
                rest.is_empty()
                    || (matches!(rest.chars().next(), Some('-' | '_')) && !rest.contains('.'))
            })
        });

        has_doc_extension || has_doc_name
    })
}
111162
}

src/swe/harness.rs

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,28 @@ async fn docker_rm(container: &str) {
143143
.await;
144144
}
145145

146+
async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> {
147+
use tokio::io::AsyncWriteExt;
148+
let tee_cmd = format!("cat > '/repo/{}'", path);
149+
let mut child = Command::new("docker")
150+
.args([
151+
"exec", "-i", "-w", "/repo", container, "bash", "-c", &tee_cmd,
152+
])
153+
.stdin(std::process::Stdio::piped())
154+
.stdout(std::process::Stdio::null())
155+
.stderr(std::process::Stdio::piped())
156+
.spawn()?;
157+
if let Some(ref mut stdin) = child.stdin {
158+
stdin.write_all(content.as_bytes()).await?;
159+
stdin.shutdown().await?;
160+
}
161+
let output = child.wait_with_output().await?;
162+
if !output.status.success() {
163+
anyhow::bail!("write failed: {}", String::from_utf8_lossy(&output.stderr));
164+
}
165+
Ok(())
166+
}
167+
146168
fn container_name(task_id: &str) -> String {
147169
let safe = task_id.replace('/', "-").replace(' ', "_");
148170
format!("swe-harness-{safe}")
@@ -186,6 +208,14 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
186208
let agent_dir_abs =
187209
std::fs::canonicalize(&config.agent_dir).unwrap_or_else(|_| config.agent_dir.clone());
188210

211+
// Auto-select Docker image based on task language unless overridden
212+
let docker_image = if config.docker_image == "python:3.12-slim" && task.language != "unknown" {
213+
super::docker_sandbox::image_for_language(&task.language).to_string()
214+
} else {
215+
config.docker_image.clone()
216+
};
217+
info!(task_id = %task.id, language = %task.language, image = %docker_image, "Selected Docker image");
218+
189219
// Remove stale container if exists
190220
docker_rm(&cname).await;
191221

@@ -202,7 +232,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
202232
&format!("{}:/agent:ro", agent_dir_abs.display()),
203233
"-w",
204234
"/repo",
205-
&config.docker_image,
235+
&docker_image,
206236
"sleep",
207237
"7200",
208238
])
@@ -243,7 +273,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
243273

244274
// Clone repo
245275
let clone_cmd = format!(
246-
"git clone --depth 100 https://github.com/{}.git /repo 2>&1",
276+
"git clone --depth 500 https://github.com/{}.git /repo 2>&1",
247277
task.repo
248278
);
249279
let (code, _, err) = docker_exec(&cname, &clone_cmd, 180).await;
@@ -261,8 +291,26 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
261291
)
262292
.await;
263293
if code != 0 {
264-
result.error = Some(format!("Checkout failed: {}", truncate(&err, 500)));
265-
return result;
294+
info!(task_id = %task.id, "Shallow clone missed commit, fetching full history...");
295+
let (fcode, _, _ferr) =
296+
docker_exec(&cname, "cd /repo && git fetch --unshallow 2>&1", 300).await;
297+
if fcode != 0 {
298+
result.error = Some(format!(
299+
"Checkout failed (even after unshallow): {}",
300+
truncate(&err, 500)
301+
));
302+
return result;
303+
}
304+
let (code2, _, err2) = docker_exec(
305+
&cname,
306+
&format!("cd /repo && git checkout {} --force 2>&1", task.base_commit),
307+
60,
308+
)
309+
.await;
310+
if code2 != 0 {
311+
result.error = Some(format!("Checkout failed: {}", truncate(&err2, 500)));
312+
return result;
313+
}
266314
}
267315
}
268316

@@ -289,6 +337,23 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
289337
warn!(task_id = %task.id, "Agent requirements install returned non-zero (continuing)");
290338
}
291339

340+
// Copy test files into container
341+
if let Some(test_files_json) = task.meta.get("test_files") {
342+
if let Ok(files) =
343+
serde_json::from_str::<Vec<super::test_generator::TestFile>>(test_files_json)
344+
{
345+
for tf in &files {
346+
let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", tf.path);
347+
docker_exec(&cname, &mkdir_cmd, 10).await;
348+
let write_result = docker_write_file(&cname, &tf.path, &tf.content).await;
349+
if let Err(e) = write_result {
350+
warn!(task_id = %task.id, path = %tf.path, "Failed to copy test file: {}", e);
351+
}
352+
}
353+
info!(task_id = %task.id, "Copied {} test files into container", files.len());
354+
}
355+
}
356+
292357
// 2. SANITY CHECK: fail_to_pass must fail, pass_to_pass must pass
293358
info!(task_id = %task.id, "Running sanity checks...");
294359

src/swe/pipeline.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ impl SwePipeline {
360360
enriched.stars,
361361
enriched.files_changed,
362362
added_lines,
363+
&enriched.changed_files,
363364
);
364365
filtered_count.fetch_add(1, Ordering::Relaxed);
365366
if !filter_result.accepted {

src/swe/quality.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pub struct QualityConfig {
1414
impl Default for QualityConfig {
1515
fn default() -> Self {
1616
Self {
17-
min_quality_score: 0.1,
17+
min_quality_score: 0.25,
1818
}
1919
}
2020
}
@@ -335,7 +335,7 @@ impl QualityScorer {
335335
};
336336

337337
let score = classification.score.clamp(0.0, 1.0);
338-
let passed = score >= self.config.min_quality_score;
338+
let passed = score >= self.config.min_quality_score && classification.quality_good;
339339

340340
tracing::info!(
341341
task_id = %task.id,

src/swe/test_generator.rs

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use crate::swe::docker_sandbox::DockerSandbox;
1515
use crate::swe::SweTask;
1616

1717
const MAX_AGENT_TURNS: usize = 200;
18-
const MAX_VALIDATION_RETRIES: usize = 2;
18+
const MAX_VALIDATION_RETRIES: usize = 3;
1919

2020
const SYSTEM_PROMPT: &str = r#"You are a test engineer writing verification tests for GitHub pull requests for the SWE-bench benchmark.
2121
@@ -34,6 +34,8 @@ WORKFLOW:
3434
3. Find existing test suites covering code ADJACENT to the PR changes -- add them as pass_to_pass.
3535
4. Write NEW test files that exercise the BEHAVIOR introduced by the PR.
3636
5. Run your tests via `shell` to validate: fail_to_pass MUST fail, pass_to_pass MUST pass on base.
37+
5b. VERIFY pass_to_pass: Run each pass_to_pass command via `shell` and confirm exit code 0.
38+
If it fails, choose a different existing test or use a build command instead.
3739
6. Call `submit_tests` with everything.
3840
3941
MANDATORY RULES FOR TEST QUALITY:
@@ -58,6 +60,11 @@ MANDATORY RULES FOR TEST QUALITY:
5860
- If the project has a test suite, find relevant existing test commands and include them.
5961
- If the PR changes function_a() in a module, test that function_b() still works (pass_to_pass).
6062
- If the PR changes a class method, verify other methods on the same class are unaffected.
63+
- CRITICAL: pass_to_pass commands MUST use EXISTING test infrastructure that already works
64+
on the base commit. Run the command yourself via `shell` BEFORE submitting to verify it passes.
65+
- Do NOT create new test files for pass_to_pass. Use the project's existing test commands.
66+
- If no existing tests exist adjacent to the PR, use a simple build command (e.g., `cargo build`,
67+
`npm run build`, `go build ./...`) as pass_to_pass instead.
6168
6269
4. ROBUSTNESS & EDGE CASES (derive from the PR diff):
6370
- If the PR adds input validation: test with null, empty, oversized, malformed inputs.
@@ -316,6 +323,29 @@ impl TestGenerator {
316323
}
317324
}
318325

326+
if submit.fail_to_pass.is_empty() {
327+
if validation_retries < MAX_VALIDATION_RETRIES {
328+
validation_retries += 1;
329+
tracing::warn!(
330+
task_id = %task.id,
331+
retry = validation_retries,
332+
"Rejecting empty fail_to_pass"
333+
);
334+
messages.push(Message::tool_result(
335+
&tc.id,
336+
"REJECTED: fail_to_pass must contain at least one test command. \
337+
Write a test that FAILS on the base commit and PASSES after the PR patch is applied.".to_string(),
338+
));
339+
continue;
340+
}
341+
messages.push(Message::tool_result(
342+
&tc.id,
343+
"REJECTED: fail_to_pass is still empty after retries."
344+
.to_string(),
345+
));
346+
continue;
347+
}
348+
319349
// --- Heuristic: reject string-matching tests ---
320350
if let Some(rejection) = reject_string_matching_tests(&all_files) {
321351
if validation_retries < MAX_VALIDATION_RETRIES {
@@ -338,8 +368,15 @@ impl TestGenerator {
338368
}
339369
tracing::warn!(
340370
task_id = %task.id,
341-
"String-matching tests after max retries, accepting anyway"
371+
"String-matching tests after max retries, REJECTING"
342372
);
373+
messages.push(Message::tool_result(
374+
&tc.id,
375+
"REJECTED: Your tests still use forbidden source-reading patterns after multiple retries. \
376+
Rewrite completely: import modules, call functions, check return values. \
377+
Do NOT read source files.".to_string(),
378+
));
379+
continue;
343380
}
344381

345382
// --- Dual-commit validation: apply patch, re-run tests ---
@@ -369,8 +406,16 @@ impl TestGenerator {
369406
}
370407
tracing::warn!(
371408
task_id = %task.id,
372-
"Dual-commit validation failed after max retries, accepting with warning"
409+
"Dual-commit validation failed after max retries, REJECTING"
373410
);
411+
messages.push(Message::tool_result(
412+
&tc.id,
413+
format!(
414+
"REJECTED: {reason}\n\nYour tests failed dual-commit validation after multiple retries. \
415+
Rewrite your tests completely."
416+
),
417+
));
418+
continue;
374419
}
375420
ValidationResult::Accepted => {
376421
tracing::info!(
@@ -463,10 +508,12 @@ impl TestGenerator {
463508
if apply_3way.exit_code != 0 {
464509
tracing::warn!(
465510
stderr = %apply_3way.stderr,
466-
"Patch apply failed, skipping dual-commit validation"
511+
"Patch apply failed, rejecting task"
467512
);
468513
sandbox.exec("git checkout -- . 2>/dev/null", 10_000).await;
469-
return ValidationResult::Accepted;
514+
return ValidationResult::Rejected(
515+
"PR patch could not be applied to the base commit. The test cannot be validated.".to_string()
516+
);
470517
}
471518
}
472519

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
python -m unittest tests/test_yquake2_riscv_config.py
2+
python -m compileall -q python-src
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# batocera-linux/batocera.linux-15418 (original PR)
2+
3+
batocera-linux/batocera.linux (#15418): no yquake2 for riscv boards... yet
4+
5+
(no description)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# batocera-linux/batocera.linux-15418
2+
3+
batocera-linux/batocera.linux (#15418): no yquake2 for riscv boards... yet
4+
5+
Disable or withhold the yquake2 package/build for RISC-V boards so it is not offered or built on those platforms until support is ready.

0 commit comments

Comments
 (0)