diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index d8b99165e97..ff5d5b81040 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1000,6 +1000,7 @@ dependencies = [ "codex-backend-client", "codex-common", "codex-core", + "codex-eval-case", "codex-feedback", "codex-file-search", "codex-login", @@ -1350,6 +1351,19 @@ dependencies = [ "wiremock", ] +[[package]] +name = "codex-eval-case" +version = "0.0.0" +dependencies = [ + "anyhow", + "pretty_assertions", + "serde", + "serde_json", + "tempfile", + "time", + "uuid", +] + [[package]] name = "codex-exec" version = "0.0.0" @@ -1724,6 +1738,7 @@ dependencies = [ "codex-backend-client", "codex-common", "codex-core", + "codex-eval-case", "codex-feedback", "codex-file-search", "codex-login", diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index 20fd10ee9d3..56d8be9e91a 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -19,6 +19,7 @@ members = [ "exec-server", "execpolicy", "execpolicy-legacy", + "eval-case", "keyring-store", "file-search", "linux-sandbox", @@ -75,6 +76,7 @@ codex-common = { path = "common" } codex-core = { path = "core" } codex-exec = { path = "exec" } codex-execpolicy = { path = "execpolicy" } +codex-eval-case = { path = "eval-case" } codex-feedback = { path = "feedback" } codex-file-search = { path = "file-search" } codex-git = { path = "utils/git" } diff --git a/codex-rs/app-server-protocol/src/protocol/common.rs b/codex-rs/app-server-protocol/src/protocol/common.rs index 83fa53b9973..0b522c918b8 100644 --- a/codex-rs/app-server-protocol/src/protocol/common.rs +++ b/codex-rs/app-server-protocol/src/protocol/common.rs @@ -174,6 +174,11 @@ client_request_definitions! { response: v2::FeedbackUploadResponse, }, + EvalCaseCreate => "evalCase/create" { + params: v2::EvalCaseCreateParams, + response: v2::EvalCaseCreateResponse, + }, + /// Execute a command (argv vector) under the server's sandbox. OneOffCommandExec => "command/exec" { params: v2::CommandExecParams, diff --git a/codex-rs/app-server-protocol/src/protocol/v2.rs b/codex-rs/app-server-protocol/src/protocol/v2.rs index 7f09216eab3..28a90c39fff 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2.rs @@ -930,6 +930,50 @@ pub struct FeedbackUploadResponse { pub thread_id: String, } +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] +#[serde(rename_all = "snake_case")] +#[ts(export_to = "v2/")] +pub enum EvalCaseStartMarkerKind { + RolloutLineTimestamp, + RolloutLineIndex, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] +#[serde(untagged)] +#[ts(export_to = "v2/")] +pub enum EvalCaseStartMarkerValue { + Timestamp(String), + LineIndex(u64), +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] +#[serde(rename_all = "camelCase")] +#[ts(export_to = "v2/")] +pub struct EvalCaseStartMarker { + pub kind: EvalCaseStartMarkerKind, + pub value: EvalCaseStartMarkerValue, + pub display: String, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] +#[serde(rename_all = "camelCase")] +#[ts(export_to = "v2/")] +pub struct EvalCaseCreateParams { + pub thread_id: String, + pub start: EvalCaseStartMarker, + pub what_went_wrong: String, + pub what_good_looks_like: String, + pub include_logs: bool, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] +#[serde(rename_all = "camelCase")] +#[ts(export_to = "v2/")] +pub struct EvalCaseCreateResponse { + pub case_id: String, + pub path: String, +} + #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] #[serde(rename_all = "camelCase")] #[ts(export_to = "v2/")] diff --git a/codex-rs/app-server/Cargo.toml b/codex-rs/app-server/Cargo.toml index fbe9150a1a6..3e6e59b8220 100644 --- a/codex-rs/app-server/Cargo.toml +++ b/codex-rs/app-server/Cargo.toml @@ -21,6 +21,7 @@ codex-arg0 = { workspace = true } codex-common = { workspace = true, features = ["cli"] } codex-core = { workspace = true } codex-backend-client = { workspace = true } +codex-eval-case = { workspace = true } codex-file-search = { workspace = true } codex-login = { workspace = true } codex-protocol = { workspace = true } diff --git a/codex-rs/app-server/src/codex_message_processor.rs b/codex-rs/app-server/src/codex_message_processor.rs index d1804801d58..9e8959116fc 100644 --- a/codex-rs/app-server/src/codex_message_processor.rs +++ b/codex-rs/app-server/src/codex_message_processor.rs @@ -25,6 +25,8 @@ use codex_app_server_protocol::ClientRequest; use codex_app_server_protocol::CommandExecParams; use codex_app_server_protocol::ConversationGitInfo; use codex_app_server_protocol::ConversationSummary; +use codex_app_server_protocol::EvalCaseCreateParams; +use codex_app_server_protocol::EvalCaseCreateResponse; use codex_app_server_protocol::ExecOneOffCommandResponse; use codex_app_server_protocol::FeedbackUploadParams; use codex_app_server_protocol::FeedbackUploadResponse; @@ -512,6 +514,9 @@ impl CodexMessageProcessor { ClientRequest::FeedbackUpload { request_id, params } => { self.upload_feedback(request_id, params).await; } + ClientRequest::EvalCaseCreate { request_id, params } => { + self.eval_case_create(request_id, params).await; + } } } @@ -3301,6 +3306,170 @@ impl CodexMessageProcessor { } } + async fn eval_case_create(&self, request_id: RequestId, params: EvalCaseCreateParams) { + let EvalCaseCreateParams { + thread_id, + start, + what_went_wrong, + what_good_looks_like, + include_logs, + } = params; + + if !include_logs { + let error = JSONRPCErrorError { + code: INVALID_REQUEST_ERROR_CODE, + message: "eval case bundles always include codex-logs.log; set include_logs=true" + .to_string(), + data: None, + }; + self.outgoing.send_error(request_id, error).await; + return; + } + + let conversation_id = match ConversationId::from_string(&thread_id) { + Ok(id) => id, + Err(err) => { + let error = JSONRPCErrorError { + code: INVALID_REQUEST_ERROR_CODE, + message: format!("invalid thread id: {err}"), + data: None, + }; + self.outgoing.send_error(request_id, error).await; + return; + } + }; + + let Some(rollout_path) = self.resolve_rollout_path(conversation_id).await else { + let error = JSONRPCErrorError { + code: INVALID_REQUEST_ERROR_CODE, + message: "could not resolve rollout path for thread".to_string(), + data: None, + }; + self.outgoing.send_error(request_id, error).await; + return; + }; + + let rollout_text = match std::fs::read_to_string(&rollout_path) { + Ok(text) => text, + Err(err) => { + let error = JSONRPCErrorError { + code: INTERNAL_ERROR_CODE, + message: format!( + "failed to read rollout file {}: {err}", + rollout_path.display() + ), + data: None, + }; + self.outgoing.send_error(request_id, error).await; + return; + } + }; + + let start_marker = match (&start.kind, &start.value) { + ( + codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineTimestamp, + codex_app_server_protocol::EvalCaseStartMarkerValue::Timestamp(value), + ) => codex_eval_case::StartMarker { + kind: codex_eval_case::StartMarkerKind::RolloutLineTimestamp, + value: codex_eval_case::StartMarkerValue::Timestamp(value.clone()), + display: start.display.clone(), + }, + ( + codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineIndex, + codex_app_server_protocol::EvalCaseStartMarkerValue::LineIndex(value), + ) => codex_eval_case::StartMarker { + kind: codex_eval_case::StartMarkerKind::RolloutLineIndex, + value: codex_eval_case::StartMarkerValue::LineIndex(*value), + display: start.display.clone(), + }, + _ => { + let error = JSONRPCErrorError { + code: INVALID_REQUEST_ERROR_CODE, + message: "start marker kind/value mismatch".to_string(), + data: None, + }; + self.outgoing.send_error(request_id, error).await; + return; + } + }; + + let last_line_index = rollout_text.lines().count().saturating_sub(1); + let requested_start_is_now = start.display == "Start now" + || matches!((&start.kind, &start.value), + (codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineIndex, + codex_app_server_protocol::EvalCaseStartMarkerValue::LineIndex(index)) + if usize::try_from(*index).ok() == Some(last_line_index) + ); + + let repo_snapshot = repo_snapshot_from_rollout(&rollout_text, &start_marker); + if repo_snapshot.is_none() && !requested_start_is_now { + let error = JSONRPCErrorError { + code: INVALID_REQUEST_ERROR_CODE, + message: "no repo snapshot available for requested start marker".to_string(), + data: None, + }; + self.outgoing.send_error(request_id, error).await; + return; + } + + let logs_bytes = include_logs.then(|| { + self.feedback + .snapshot(Some(conversation_id)) + .as_bytes() + .to_vec() + }); + + let args = codex_eval_case::CreateEvalCaseArgs { + codex_home: self.config.codex_home.clone(), + conversation_id: thread_id.clone(), + rollout_path, + start: start_marker, + repo_cwd: self.config.cwd.clone(), + repo_snapshot, + notes: codex_eval_case::Notes { + what_went_wrong, + what_good_looks_like, + }, + include_logs, + logs_bytes, + }; + + let result = + tokio::task::spawn_blocking(move || codex_eval_case::create_eval_case_bundle(&args)) + .await; + + let result = match result { + Ok(outcome) => outcome, + Err(join_err) => { + let error = JSONRPCErrorError { + code: INTERNAL_ERROR_CODE, + message: format!("failed to create eval case bundle: {join_err}"), + data: None, + }; + self.outgoing.send_error(request_id, error).await; + return; + } + }; + + match result { + Ok(outcome) => { + let response = EvalCaseCreateResponse { + case_id: outcome.case_id, + path: outcome.path.display().to_string(), + }; + self.outgoing.send_response(request_id, response).await; + } + Err(err) => { + let error = JSONRPCErrorError { + code: INTERNAL_ERROR_CODE, + message: format!("failed to create eval case bundle: {err}"), + data: None, + }; + self.outgoing.send_error(request_id, error).await; + } + } + } + async fn resolve_rollout_path(&self, conversation_id: ConversationId) -> Option { match self .conversation_manager @@ -3313,6 +3482,56 @@ impl CodexMessageProcessor { } } +fn repo_snapshot_from_rollout( + rollout_text: &str, + start_marker: &codex_eval_case::StartMarker, +) -> Option { + match &start_marker.value { + codex_eval_case::StartMarkerValue::LineIndex(index) => { + let index = usize::try_from(*index).ok()?; + repo_snapshot_from_rollout_line_index(rollout_text, index) + } + codex_eval_case::StartMarkerValue::Timestamp(timestamp) => { + repo_snapshot_from_rollout_timestamp(rollout_text, timestamp) + } + } +} + +fn repo_snapshot_from_rollout_timestamp( + rollout_text: &str, + timestamp: &str, +) -> Option { + let start_index = rollout_text.lines().enumerate().find_map(|(idx, line)| { + let rollout_line = + serde_json::from_str::(line).ok()?; + (rollout_line.timestamp == timestamp).then_some(idx) + })?; + repo_snapshot_from_rollout_line_index(rollout_text, start_index) +} + +fn repo_snapshot_from_rollout_line_index( + rollout_text: &str, + start_index: usize, +) -> Option { + rollout_text + .lines() + .enumerate() + .skip(start_index.saturating_add(1)) + .find_map(|(_idx, line)| { + let rollout_line = + serde_json::from_str::(line).ok()?; + match rollout_line.item { + codex_protocol::protocol::RolloutItem::ResponseItem( + codex_protocol::models::ResponseItem::GhostSnapshot { ghost_commit }, + ) => Some(codex_eval_case::RepoSnapshot { + base_sha: ghost_commit.parent()?.to_string(), + commit_sha: ghost_commit.id().to_string(), + }), + _ => None, + } + }) +} + fn skills_to_info( skills: &[codex_core::skills::SkillMetadata], ) -> Vec { @@ -3619,4 +3838,58 @@ mod tests { assert_eq!(summary, expected); Ok(()) } + + #[test] + fn repo_snapshot_from_rollout_line_index_finds_next_ghost_snapshot() -> Result<()> { + use serde_json::json; + + let user_line = json!({ + "timestamp": "2025-09-05T16:53:11.850Z", + "type": "response_item", + "payload": { + "type": "message", + "role": "user", + "content": [{ + "type": "input_text", + "text": "hi", + }], + }, + }); + + let ghost_line = json!({ + "timestamp": "2025-09-05T16:53:12.000Z", + "type": "response_item", + "payload": { + "type": "ghost_snapshot", + "ghost_commit": { + "id": "ghost-sha", + "parent": "base-sha", + "preexisting_untracked_files": [], + "preexisting_untracked_dirs": [], + }, + }, + }); + + let rollout_text = format!( + "{}\n{}\n", + serde_json::to_string(&user_line)?, + serde_json::to_string(&ghost_line)? + ); + + let start = codex_eval_case::StartMarker { + kind: codex_eval_case::StartMarkerKind::RolloutLineIndex, + value: codex_eval_case::StartMarkerValue::LineIndex(0), + display: "From: hi".to_string(), + }; + + let snapshot = repo_snapshot_from_rollout(&rollout_text, &start).expect("snapshot"); + assert_eq!( + snapshot, + codex_eval_case::RepoSnapshot { + base_sha: "base-sha".to_string(), + commit_sha: "ghost-sha".to_string(), + } + ); + Ok(()) + } } diff --git a/codex-rs/apply-patch/src/lib.rs b/codex-rs/apply-patch/src/lib.rs index f58055f4505..5d8a1afa530 100644 --- a/codex-rs/apply-patch/src/lib.rs +++ b/codex-rs/apply-patch/src/lib.rs @@ -209,34 +209,19 @@ pub fn apply_hunks( stdout: &mut impl std::io::Write, stderr: &mut impl std::io::Write, ) -> Result<(), ApplyPatchError> { - let _existing_paths: Vec<&Path> = hunks - .iter() - .filter_map(|hunk| match hunk { - Hunk::AddFile { .. } => { - // The file is being added, so it doesn't exist yet. - None - } - Hunk::DeleteFile { path } => Some(path.as_path()), - Hunk::UpdateFile { - path, move_path, .. - } => match move_path { - Some(move_path) => { - if std::fs::metadata(move_path) - .map(|m| m.is_file()) - .unwrap_or(false) - { - Some(move_path.as_path()) - } else { - None - } - } - None => Some(path.as_path()), - }, - }) - .collect::>(); + apply_hunks_in_dir(hunks, Path::new("."), stdout, stderr) +} +/// Applies hunks relative to `cwd` (without mutating the process working directory) and writes the +/// same stdout/stderr output as `apply_hunks`. +pub fn apply_hunks_in_dir( + hunks: &[Hunk], + cwd: &Path, + stdout: &mut impl std::io::Write, + stderr: &mut impl std::io::Write, +) -> Result<(), ApplyPatchError> { // Delegate to a helper that applies each hunk to the filesystem. - match apply_hunks_to_files(hunks) { + match apply_hunks_to_files_in_dir(hunks, cwd) { Ok(affected) => { print_summary(&affected, stdout).map_err(ApplyPatchError::from)?; Ok(()) @@ -267,7 +252,7 @@ pub struct AffectedPaths { /// Apply the hunks to the filesystem, returning which files were added, modified, or deleted. /// Returns an error if the patch could not be applied. -fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result { +fn apply_hunks_to_files_in_dir(hunks: &[Hunk], cwd: &Path) -> anyhow::Result { if hunks.is_empty() { anyhow::bail!("No files were modified."); } @@ -278,19 +263,29 @@ fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result { for hunk in hunks { match hunk { Hunk::AddFile { path, contents } => { - if let Some(parent) = path.parent() + let target_path = if path.is_absolute() { + path.clone() + } else { + cwd.join(path) + }; + if let Some(parent) = target_path.parent() && !parent.as_os_str().is_empty() { std::fs::create_dir_all(parent).with_context(|| { format!("Failed to create parent directories for {}", path.display()) })?; } - std::fs::write(path, contents) + std::fs::write(&target_path, contents) .with_context(|| format!("Failed to write file {}", path.display()))?; added.push(path.clone()); } Hunk::DeleteFile { path } => { - std::fs::remove_file(path) + let target_path = if path.is_absolute() { + path.clone() + } else { + cwd.join(path) + }; + std::fs::remove_file(&target_path) .with_context(|| format!("Failed to delete file {}", path.display()))?; deleted.push(path.clone()); } @@ -300,22 +295,38 @@ fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result { chunks, } => { let AppliedPatch { new_contents, .. } = - derive_new_contents_from_chunks(path, chunks)?; + derive_new_contents_from_chunks_in_dir(path, chunks, cwd)?; if let Some(dest) = move_path { - if let Some(parent) = dest.parent() + let dest_path = if dest.is_absolute() { + dest.clone() + } else { + cwd.join(dest) + }; + if let Some(parent) = dest_path.parent() && !parent.as_os_str().is_empty() { std::fs::create_dir_all(parent).with_context(|| { format!("Failed to create parent directories for {}", dest.display()) })?; } - std::fs::write(dest, new_contents) + std::fs::write(&dest_path, new_contents) .with_context(|| format!("Failed to write file {}", dest.display()))?; - std::fs::remove_file(path) + + let src_path = if path.is_absolute() { + path.clone() + } else { + cwd.join(path) + }; + std::fs::remove_file(&src_path) .with_context(|| format!("Failed to remove original {}", path.display()))?; modified.push(dest.clone()); } else { - std::fs::write(path, new_contents) + let target_path = if path.is_absolute() { + path.clone() + } else { + cwd.join(path) + }; + std::fs::write(&target_path, new_contents) .with_context(|| format!("Failed to write file {}", path.display()))?; modified.push(path.clone()); } @@ -340,7 +351,21 @@ fn derive_new_contents_from_chunks( path: &Path, chunks: &[UpdateFileChunk], ) -> std::result::Result { - let original_contents = match std::fs::read_to_string(path) { + derive_new_contents_from_chunks_in_dir(path, chunks, Path::new(".")) +} + +fn derive_new_contents_from_chunks_in_dir( + path: &Path, + chunks: &[UpdateFileChunk], + cwd: &Path, +) -> std::result::Result { + let target_path = if path.is_absolute() { + path.to_path_buf() + } else { + cwd.join(path) + }; + + let original_contents = match std::fs::read_to_string(&target_path) { Ok(contents) => contents, Err(err) => { return Err(ApplyPatchError::IoError(IoError { diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 5fb86e8718e..e3617fd4800 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -343,6 +343,7 @@ pub(crate) struct Session { /// The set of enabled features should be invariant for the lifetime of the /// session. features: Features, + repo_snapshotting_enabled: AtomicBool, pub(crate) active_turn: Mutex>, pub(crate) services: SessionServices, next_internal_sub_id: AtomicU64, @@ -673,6 +674,7 @@ impl Session { tx_event: tx_event.clone(), state: Mutex::new(state), features: config.features.clone(), + repo_snapshotting_enabled: AtomicBool::new(false), active_turn: Mutex::new(None), services, next_internal_sub_id: AtomicU64::new(0), @@ -1439,7 +1441,9 @@ impl Session { turn_context: Arc, cancellation_token: CancellationToken, ) { - if !self.enabled(Feature::GhostCommit) { + if !self.enabled(Feature::GhostCommit) + && !self.repo_snapshotting_enabled.load(Ordering::Relaxed) + { return; } let token = match turn_context.tool_call_gate.subscribe().await { @@ -1609,6 +1613,9 @@ async fn submission_loop(sess: Arc, config: Arc, rx_sub: Receiv ) .await; } + Op::SetRepoSnapshotting { enabled } => { + handlers::set_repo_snapshotting(&sess, enabled).await; + } Op::UserInput { .. } | Op::UserTurn { .. } => { handlers::user_input_or_turn(&sess, sub.id.clone(), sub.op, &mut previous_context) .await; @@ -1707,6 +1714,7 @@ mod handlers { use mcp_types::RequestId; use std::path::PathBuf; use std::sync::Arc; + use std::sync::atomic::Ordering; use tracing::info; use tracing::warn; @@ -1731,6 +1739,11 @@ mod handlers { } } + pub async fn set_repo_snapshotting(sess: &Session, enabled: bool) { + sess.repo_snapshotting_enabled + .store(enabled, Ordering::Relaxed); + } + pub async fn user_input_or_turn( sess: &Arc, sub_id: String, @@ -3161,6 +3174,7 @@ mod tests { tx_event, state: Mutex::new(state), features: config.features.clone(), + repo_snapshotting_enabled: AtomicBool::new(false), active_turn: Mutex::new(None), services, next_internal_sub_id: AtomicU64::new(0), @@ -3248,6 +3262,7 @@ mod tests { tx_event, state: Mutex::new(state), features: config.features.clone(), + repo_snapshotting_enabled: AtomicBool::new(false), active_turn: Mutex::new(None), services, next_internal_sub_id: AtomicU64::new(0), diff --git a/codex-rs/core/src/tools/runtimes/apply_patch.rs b/codex-rs/core/src/tools/runtimes/apply_patch.rs index 26d04f578c5..51c94922a30 100644 --- a/codex-rs/core/src/tools/runtimes/apply_patch.rs +++ b/codex-rs/core/src/tools/runtimes/apply_patch.rs @@ -23,6 +23,7 @@ use codex_protocol::protocol::ReviewDecision; use futures::future::BoxFuture; use std::collections::HashMap; use std::path::PathBuf; +use std::time::Instant; #[derive(Clone, Debug)] pub struct ApplyPatchRequest { @@ -143,6 +144,48 @@ impl ToolRuntime for ApplyPatchRuntime { attempt: &SandboxAttempt<'_>, ctx: &ToolCtx<'_>, ) -> Result { + // When there is no sandbox in play (DangerFullAccess / bypassed sandbox), apply the patch + // in-process. This avoids relying on the current executable implementing the arg0/argv1 + // dispatch behavior (which is not true for unit/integration test binaries). + if attempt.sandbox == crate::exec::SandboxType::None { + let started = Instant::now(); + let mut stdout = Vec::new(); + let mut stderr = Vec::new(); + + // Apply the patch relative to `req.cwd` without mutating the process CWD. + let exit_code = match codex_apply_patch::parse_patch(&req.patch) { + Ok(args) => { + match codex_apply_patch::apply_hunks_in_dir( + &args.hunks, + &req.cwd, + &mut stdout, + &mut stderr, + ) { + Ok(()) => 0, + Err(_) => 1, + } + } + // Reuse codex-apply-patch's error formatting for parse errors. + Err(_) => { + match codex_apply_patch::apply_patch(&req.patch, &mut stdout, &mut stderr) { + Ok(()) => 0, + Err(_) => 1, + } + } + }; + let stdout = String::from_utf8_lossy(&stdout).to_string(); + let stderr = String::from_utf8_lossy(&stderr).to_string(); + let aggregated_output = format!("{stdout}{stderr}"); + return Ok(ExecToolCallOutput { + exit_code, + stdout: crate::exec::StreamOutput::new(stdout), + stderr: crate::exec::StreamOutput::new(stderr), + aggregated_output: crate::exec::StreamOutput::new(aggregated_output), + duration: started.elapsed(), + timed_out: false, + }); + } + let spec = Self::build_command_spec(req)?; let env = attempt .env_for(spec) diff --git a/codex-rs/core/tests/suite/shell_snapshot.rs b/codex-rs/core/tests/suite/shell_snapshot.rs index 8357fb8a95a..3606e8353e6 100644 --- a/codex-rs/core/tests/suite/shell_snapshot.rs +++ b/codex-rs/core/tests/suite/shell_snapshot.rs @@ -8,6 +8,7 @@ use codex_core::protocol::Op; use codex_core::protocol::SandboxPolicy; use codex_protocol::config_types::ReasoningSummary; use codex_protocol::user_input::UserInput; +use core_test_support::responses::ResponsesRequest; use core_test_support::responses::ev_assistant_message; use core_test_support::responses::ev_completed; use core_test_support::responses::ev_function_call; @@ -277,7 +278,7 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> { ev_completed("resp-2"), ]), ]; - mount_sse_sequence(harness.server(), responses).await; + let requests = mount_sse_sequence(harness.server(), responses).await; let model = test.session_configured.model.clone(); codex @@ -297,7 +298,14 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> { wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await; - assert_eq!(fs::read_to_string(&target).await?, "hello from snapshot\n"); + let tool_output = extract_call_output_text(&requests.requests(), call_id); + let target_contents = fs::read_to_string(&target).await.unwrap_or_else(|err| { + panic!( + "expected patch to create {}: {err}; tool output={tool_output:?}", + target.display() + ) + }); + assert_eq!(target_contents, "hello from snapshot\n"); let mut entries = fs::read_dir(codex_home.join("shell_snapshots")).await?; let snapshot_path = entries @@ -311,6 +319,35 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> { Ok(()) } +fn extract_call_output_text(requests: &[ResponsesRequest], call_id: &str) -> String { + for req in requests { + let input = req.input(); + let item = input.iter().find(|item| { + item.get("type").and_then(serde_json::Value::as_str) == Some("function_call_output") + && item.get("call_id").and_then(serde_json::Value::as_str) == Some(call_id) + }); + let Some(item) = item else { + continue; + }; + + let output = item + .get("output") + .cloned() + .unwrap_or(serde_json::Value::Null); + match output { + serde_json::Value::String(text) => return text, + serde_json::Value::Object(obj) => { + if let Some(text) = obj.get("content").and_then(serde_json::Value::as_str) { + return text.to_string(); + } + } + _ => {} + } + } + + "".to_string() +} + #[cfg_attr(target_os = "windows", ignore)] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn shell_snapshot_deleted_after_shutdown_with_skills() -> Result<()> { diff --git a/codex-rs/docs/eval_capture.md b/codex-rs/docs/eval_capture.md new file mode 100644 index 00000000000..0a9a9d46d43 --- /dev/null +++ b/codex-rs/docs/eval_capture.md @@ -0,0 +1,58 @@ +# Eval Capture Bundles + +Codex can capture "eval case" bundles from the `/feedback` flow (bad result -> capture eval sample). +These bundles are meant to turn real failures into reproducible, local-first artifacts. + +## Where Bundles Are Written + +Bundles are stored under: + +`$CODEX_HOME/eval-case//` + +## Bundle Contents + +Each bundle contains: + +- `manifest.json` - metadata about the capture (schema version, rollout start selector, repo git pointer). +- `rollout.jsonl` - the full session rollout (multi-turn trajectory). +- `repo.patch` - a git patch representing the repository state at the chosen start marker. +- `codex-logs.log` - tracing logs to help maintainers debug the session. + +## Start Marker And Repo State + +Bundles include the entire rollout, but also record a start marker to indicate where an eval +harness (or a human) should begin replaying/interpreting the trajectory. + +`manifest.json` also records a `rollout.start_selector`, which is the preferred way to find the +intended starting turn when replaying/slicing (line numbers in `rollout.jsonl` are not stable). + +The repository patch must match that chosen start marker: + +- If the session has repo snapshots available, `repo.patch` is derived from the ghost snapshot + commit associated with the selected user turn (diff from the snapshot's base commit to the + snapshot commit). +- If no snapshot is available for a given start marker, the TUI disables that option (and may + fall back to the basic feedback flow instead). + +For reproducibility outside your machine, the base commit recorded in `manifest.json` should be +reachable by maintainers (for example, pushed and available on the default branch). + +## Reproducibility Contract + +Reproducing the repository state in a bundle must be possible using only: + +- `repo.git.canonical_remote` (clone URL) +- `repo.git.commit` (base commit to `git checkout`) +- `repo.patch` (apply if non-empty) + +The manifest may also include machine-local paths (like `repo.root` / `repo.cwd_rel`) as debugging +hints, but repro should not depend on them. + +## App-Server API (For Integrations) + +Non-TUI clients can create bundles via the app-server JSON-RPC method: + +- `evalCase/create` + +The handler copies the rollout into the bundle and derives `repo.patch` based on the selected start +marker when repo snapshots are available. diff --git a/codex-rs/eval-case/Cargo.toml b/codex-rs/eval-case/Cargo.toml new file mode 100644 index 00000000000..19542c9c813 --- /dev/null +++ b/codex-rs/eval-case/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "codex-eval-case" +version.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +anyhow = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +time = { workspace = true, features = ["formatting", "parsing"] } +uuid = { workspace = true, features = ["v4"] } + +[dev-dependencies] +pretty_assertions = { workspace = true } +tempfile = { workspace = true } diff --git a/codex-rs/eval-case/src/lib.rs b/codex-rs/eval-case/src/lib.rs new file mode 100644 index 00000000000..17976e9a556 --- /dev/null +++ b/codex-rs/eval-case/src/lib.rs @@ -0,0 +1,993 @@ +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context as _; +use serde::Deserialize; +use serde::Serialize; +use time::OffsetDateTime; +use time::format_description::well_known::Rfc3339; +use uuid::Uuid; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum StartMarkerKind { + RolloutLineTimestamp, + RolloutLineIndex, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(untagged)] +pub enum StartMarkerValue { + Timestamp(String), + LineIndex(u64), +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct StartMarker { + pub kind: StartMarkerKind, + pub value: StartMarkerValue, + pub display: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum RolloutStartSelectorKind { + /// Find the first `event_msg` line where `payload.type == "user_message"` and + /// `payload.message` contains `contains`. + EventMsgUserMessageContains, + /// Find the first `response_item` line where `payload.role == "user"` and the combined text + /// from `payload.content[*].text` contains `contains`. + ResponseItemUserMessageContains, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct RolloutInfo { + pub filename: String, + /// Preferred, deterministic selector for slicing the rollout when reproducing a case. + pub start_selector: RolloutStartSelector, + /// Debugging hint only; do not use this for correctness when slicing. + pub start: StartMarker, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct RolloutStartSelector { + pub kind: RolloutStartSelectorKind, + pub contains: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub after_timestamp: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GitRemote { + pub name: String, + pub fetch_url: String, + pub push_url: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct RepoGitInfo { + /// Base commit to `git checkout` before applying `repo.patch`. + pub commit: String, + pub remotes: Vec, + pub canonical_remote: Option, + pub reproducible: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub reproducible_reason: Option, + pub is_dirty: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub describe: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct RepoInfo { + /// Repository root as reported by `git rev-parse --show-toplevel` (optional). + #[serde(skip_serializing_if = "Option::is_none")] + pub root: Option, + /// Relative path from `repo.root` to capture-time cwd (optional). + #[serde(skip_serializing_if = "Option::is_none")] + pub cwd_rel: Option, + pub git: RepoGitInfo, + pub patch_filename: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct Notes { + pub what_went_wrong: String, + pub what_good_looks_like: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct Artifacts { + pub include_logs: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct EvalCaseManifestV1 { + pub version: String, + pub case_id: String, + pub created_at: String, + pub conversation_id: String, + pub source: String, + pub rollout: RolloutInfo, + pub repo: RepoInfo, + pub notes: Notes, + pub artifacts: Artifacts, +} + +#[derive(Debug, Clone)] +pub struct CreateEvalCaseArgs { + pub codex_home: PathBuf, + pub conversation_id: String, + pub rollout_path: PathBuf, + pub start: StartMarker, + pub repo_cwd: PathBuf, + /// When present, derive `repo.patch` from the provided commit snapshot instead of the current + /// working tree. + pub repo_snapshot: Option, + pub notes: Notes, + pub include_logs: bool, + pub logs_bytes: Option>, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RepoSnapshot { + pub base_sha: String, + pub commit_sha: String, +} + +#[derive(Debug, Clone)] +pub struct CreateEvalCaseResult { + pub case_id: String, + pub path: PathBuf, +} + +fn sanitize_repo_slug(input: &str) -> String { + // Keep it short + filesystem-friendly. + let mut out = String::with_capacity(input.len()); + let mut last_was_dash = false; + for ch in input.chars() { + let ch = ch.to_ascii_lowercase(); + if ch.is_ascii_alphanumeric() { + out.push(ch); + last_was_dash = false; + continue; + } + if !last_was_dash { + out.push('-'); + last_was_dash = true; + } + } + let out = out.trim_matches('-').to_string(); + if out.is_empty() { + "repo".to_string() + } else { + out + } +} + +fn repo_slug(repo_cwd: &Path) -> String { + // Prefer the git repo root name, but fall back to the cwd basename. + let top_level = git_stdout(repo_cwd, &["rev-parse", "--show-toplevel"]) + .ok() + .map(PathBuf::from); + let basename = top_level + .as_ref() + .and_then(|p| p.file_name()) + .or_else(|| repo_cwd.file_name()); + let basename = basename + .map(|name| name.to_string_lossy().to_string()) + .unwrap_or_else(|| "repo".to_string()); + sanitize_repo_slug(&basename) +} + +pub fn create_eval_case_bundle(args: &CreateEvalCaseArgs) -> anyhow::Result { + let created_at = OffsetDateTime::now_utc(); + let created_at_rfc3339 = created_at.format(&Rfc3339).context("format created_at")?; + + let ts_for_id = format!( + "{:04}-{:02}-{:02}T{:02}-{:02}-{:02}", + created_at.year(), + u8::from(created_at.month()), + created_at.day(), + created_at.hour(), + created_at.minute(), + created_at.second() + ); + + // Short, human-scannable id: datetime + repo + 6-digit suffix. + // Collision risk is low and acceptable for local bundles. + let repo = repo_slug(&args.repo_cwd); + let id6 = (Uuid::new_v4().as_u128() % 1_000_000) as u32; + let case_id = format!("{ts_for_id}-{repo}-{id6:06}"); + + let bundle_dir = args.codex_home.join("eval-case").join(&case_id); + std::fs::create_dir_all(&bundle_dir) + .with_context(|| format!("create eval bundle dir {}", bundle_dir.display()))?; + + let rollout_dst = bundle_dir.join("rollout.jsonl"); + std::fs::copy(&args.rollout_path, &rollout_dst).with_context(|| { + format!( + "copy rollout {} -> {}", + args.rollout_path.display(), + rollout_dst.display() + ) + })?; + + let rollout_text = std::fs::read_to_string(&args.rollout_path).with_context(|| { + format!( + "read rollout for start selector {}", + args.rollout_path.display() + ) + })?; + + let (base_sha, patch) = match args.repo_snapshot.as_ref() { + Some(snapshot) => { + git_patch_between_commits(&args.repo_cwd, &snapshot.base_sha, &snapshot.commit_sha) + .with_context(|| { + format!( + "generate patch for repo snapshot {}..{}", + snapshot.base_sha, snapshot.commit_sha + ) + })? + } + None => git_patch_against_head(&args.repo_cwd)?, + }; + let patch_path = bundle_dir.join("repo.patch"); + std::fs::write(&patch_path, patch) + .with_context(|| format!("write patch {}", patch_path.display()))?; + + if args.include_logs { + let logs_path = bundle_dir.join("codex-logs.log"); + let bytes = args + .logs_bytes + .clone() + .unwrap_or_else(|| Vec::with_capacity(0)); + std::fs::write(&logs_path, bytes) + .with_context(|| format!("write logs {}", logs_path.display()))?; + } + + let repo_root = git_stdout(&args.repo_cwd, &["rev-parse", "--show-toplevel"]) + .ok() + .filter(|s| !s.is_empty()); + let cwd_rel = repo_root + .as_ref() + .and_then(|root| { + let root = PathBuf::from(root); + args.repo_cwd + .strip_prefix(&root) + .ok() + .map(|p| p.display().to_string()) + }) + .filter(|s| !s.is_empty()); + + let (git_remotes, remotes_ok) = match git_remotes(&args.repo_cwd) { + Ok(remotes) => (remotes, true), + Err(_) => (Vec::new(), false), + }; + let canonical_remote = select_canonical_remote(&git_remotes); + let git_is_dirty = git_is_dirty(&args.repo_cwd).unwrap_or(false); + let git_describe = git_stdout( + &args.repo_cwd, + &["describe", "--tags", "--always", "--dirty"], + ) + .ok() + .filter(|s| !s.is_empty()); + + let mut reproducible = true; + let mut reproducible_reason: Option = None; + if !remotes_ok { + reproducible = false; + reproducible_reason = Some("not_a_git_repo".to_string()); + } else if canonical_remote.is_none() { + reproducible = false; + reproducible_reason = Some("no_git_remote".to_string()); + } + if base_sha.len() != 40 || base_sha == "unknown" { + reproducible = false; + if reproducible_reason.is_none() { + reproducible_reason = Some("unknown_commit".to_string()); + } + } + + let start_selector = derive_rollout_start_selector(&args.start, &rollout_text); + + let manifest = EvalCaseManifestV1 { + version: "v1".to_string(), + case_id: case_id.clone(), + created_at: created_at_rfc3339, + conversation_id: args.conversation_id.clone(), + source: "cli".to_string(), + rollout: RolloutInfo { + filename: "rollout.jsonl".to_string(), + start_selector, + start: args.start.clone(), + }, + repo: RepoInfo { + root: repo_root, + cwd_rel, + git: RepoGitInfo { + commit: base_sha, + remotes: git_remotes, + canonical_remote, + reproducible, + reproducible_reason, + is_dirty: git_is_dirty, + describe: git_describe, + }, + patch_filename: "repo.patch".to_string(), + }, + notes: args.notes.clone(), + artifacts: Artifacts { + include_logs: args.include_logs, + }, + }; + let manifest_path = bundle_dir.join("manifest.json"); + let manifest_json = serde_json::to_string_pretty(&manifest).context("serialize manifest")?; + std::fs::write(&manifest_path, format!("{manifest_json}\n")) + .with_context(|| format!("write manifest {}", manifest_path.display()))?; + + Ok(CreateEvalCaseResult { + case_id, + path: bundle_dir, + }) +} + +fn normalize_whitespace(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for part in s.split_whitespace() { + if !out.is_empty() { + out.push(' '); + } + out.push_str(part); + } + out +} + +fn truncate_chars(s: &str, max_chars: usize) -> String { + if s.chars().count() <= max_chars { + return s.to_string(); + } + s.chars().take(max_chars).collect() +} + +fn should_show_start_message(message: &str) -> bool { + // The rollout can include synthetic "user" messages (environment context, AGENTS + // instructions) that should not be used as start selectors. + let trimmed = message.trim_start(); + if trimmed.starts_with("") { + return false; + } + if trimmed.starts_with("# AGENTS.md instructions") { + return false; + } + true +} + +fn parse_rfc3339(s: &str) -> Option { + OffsetDateTime::parse(s, &Rfc3339).ok() +} + +fn is_after_timestamp(timestamp: &str, after_timestamp: &str) -> bool { + match (parse_rfc3339(timestamp), parse_rfc3339(after_timestamp)) { + (Some(ts), Some(after)) => ts >= after, + // Fall back to lexicographic compare; RFC3339 timestamps with consistent formatting + // should still compare correctly. + _ => timestamp >= after_timestamp, + } +} + +fn derive_rollout_start_selector(start: &StartMarker, rollout_text: &str) -> RolloutStartSelector { + let after_timestamp = match &start.value { + StartMarkerValue::Timestamp(ts) => Some(ts.clone()), + StartMarkerValue::LineIndex(idx) => { + let idx = usize::try_from(*idx).ok(); + idx.and_then(|i| rollout_text.lines().nth(i)) + .and_then(|line| { + serde_json::from_str::(line) + .ok() + .and_then(|v| { + v.get("timestamp") + .and_then(serde_json::Value::as_str) + .map(str::to_string) + }) + }) + } + }; + + let (kind, message) = if let Some(message) = + find_first_event_msg_user_message(rollout_text, after_timestamp.as_deref()) + { + ( + RolloutStartSelectorKind::EventMsgUserMessageContains, + Some(message), + ) + } else if let Some(message) = + find_first_response_item_user_message(rollout_text, after_timestamp.as_deref()) + { + ( + RolloutStartSelectorKind::ResponseItemUserMessageContains, + Some(message), + ) + } else { + (RolloutStartSelectorKind::EventMsgUserMessageContains, None) + }; + + let contains = message + .map(|m| truncate_chars(&normalize_whitespace(&m), 200)) + .unwrap_or_default(); + + RolloutStartSelector { + kind, + contains, + after_timestamp, + } +} + +fn find_first_event_msg_user_message( + rollout_text: &str, + after_timestamp: Option<&str>, +) -> Option { + rollout_text.lines().find_map(|line| { + let v = serde_json::from_str::(line).ok()?; + let timestamp = v.get("timestamp")?.as_str()?; + if let Some(after) = after_timestamp + && !is_after_timestamp(timestamp, after) + { + return None; + } + + let ty = v.get("type")?.as_str()?; + if ty != "event_msg" { + return None; + } + let payload = v.get("payload")?; + let payload_ty = payload.get("type")?.as_str()?; + if payload_ty != "user_message" { + return None; + } + let message = payload.get("message")?.as_str()?; + should_show_start_message(message).then(|| message.to_string()) + }) +} + +fn find_first_response_item_user_message( + rollout_text: &str, + after_timestamp: Option<&str>, +) -> Option { + rollout_text.lines().find_map(|line| { + let v = serde_json::from_str::(line).ok()?; + let timestamp = v.get("timestamp")?.as_str()?; + if let Some(after) = after_timestamp + && !is_after_timestamp(timestamp, after) + { + return None; + } + + let ty = v.get("type")?.as_str()?; + if ty != "response_item" { + return None; + } + let payload = v.get("payload")?; + let role = payload.get("role")?.as_str()?; + if role != "user" { + return None; + } + let content = payload.get("content")?.as_array()?; + let mut out = String::new(); + for item in content { + let Some(text) = item.get("text").and_then(serde_json::Value::as_str) else { + continue; + }; + if !out.is_empty() { + out.push(' '); + } + out.push_str(text); + } + let out = normalize_whitespace(out.as_str()); + (!out.is_empty() && should_show_start_message(&out)).then_some(out) + }) +} + +pub fn find_rollout_start_index( + rollout_text: &str, + selector: &RolloutStartSelector, +) -> Option { + if selector.contains.trim().is_empty() { + return None; + } + + let lines = rollout_text.lines().collect::>(); + let matched_idx = lines.iter().enumerate().find_map(|(idx, line)| { + let v = serde_json::from_str::(line).ok()?; + let timestamp = v.get("timestamp")?.as_str()?; + if let Some(after) = selector.after_timestamp.as_deref() + && !is_after_timestamp(timestamp, after) + { + return None; + } + + match selector.kind { + RolloutStartSelectorKind::EventMsgUserMessageContains => { + let ty = v.get("type")?.as_str()?; + if ty != "event_msg" { + return None; + } + let payload = v.get("payload")?; + let payload_ty = payload.get("type")?.as_str()?; + if payload_ty != "user_message" { + return None; + } + let message = payload.get("message")?.as_str()?; + message.contains(selector.contains.as_str()).then_some(idx) + } + RolloutStartSelectorKind::ResponseItemUserMessageContains => { + let ty = v.get("type")?.as_str()?; + if ty != "response_item" { + return None; + } + let payload = v.get("payload")?; + let role = payload.get("role")?.as_str()?; + if role != "user" { + return None; + } + let content = payload.get("content")?.as_array()?; + let mut out = String::new(); + for item in content { + let Some(text) = item.get("text").and_then(serde_json::Value::as_str) else { + continue; + }; + if !out.is_empty() { + out.push(' '); + } + out.push_str(text); + } + let out = normalize_whitespace(out.as_str()); + out.contains(selector.contains.as_str()).then_some(idx) + } + } + })?; + + // Include immediately preceding turn context lines so replay has the right environment/config. + let mut start_idx = matched_idx; + while start_idx > 0 { + let prev_line = lines.get(start_idx.saturating_sub(1))?; + let Ok(v) = serde_json::from_str::(prev_line) else { + break; + }; + let Some(ty) = v.get("type").and_then(serde_json::Value::as_str) else { + break; + }; + if ty != "turn_context" { + break; + } + start_idx = start_idx.saturating_sub(1); + } + + Some(start_idx) +} + +pub fn slice_rollout_from_selector( + rollout_text: &str, + selector: &RolloutStartSelector, +) -> Option { + let start = find_rollout_start_index(rollout_text, selector)?; + let sliced = rollout_text + .lines() + .skip(start) + .collect::>() + .join("\n"); + Some(format!("{sliced}\n")) +} + +fn git_patch_against_head(repo_cwd: &Path) -> anyhow::Result<(String, Vec)> { + let base_sha = + git_stdout(repo_cwd, &["rev-parse", "HEAD"]).unwrap_or_else(|_| "unknown".to_string()); + + let mut patch = Vec::new(); + if let Ok(mut bytes) = git_diff( + repo_cwd, + &["diff", "--no-textconv", "--no-ext-diff", "--binary", "HEAD"], + ) { + patch.append(&mut bytes); + } + + let untracked = + git_stdout(repo_cwd, &["ls-files", "--others", "--exclude-standard"]).unwrap_or_default(); + for file in untracked.lines().map(str::trim).filter(|s| !s.is_empty()) { + let null_device = if cfg!(windows) { "NUL" } else { "/dev/null" }; + let args = [ + "diff", + "--no-textconv", + "--no-ext-diff", + "--binary", + "--no-index", + "--", + null_device, + file, + ]; + if let Ok(mut bytes) = git_diff(repo_cwd, &args) { + patch.append(&mut bytes); + } + } + + Ok((base_sha, patch)) +} + +fn git_remotes(repo_cwd: &Path) -> anyhow::Result> { + let output = git_stdout(repo_cwd, &["remote", "-v"])?; + // Use a BTreeMap so iteration is deterministic. + let mut by_name: std::collections::BTreeMap, Vec)> = + std::collections::BTreeMap::new(); + + for line in output.lines().map(str::trim).filter(|s| !s.is_empty()) { + let mut parts = line.split_whitespace(); + let Some(name) = parts.next() else { + continue; + }; + let Some(url) = parts.next() else { + continue; + }; + let Some(kind) = parts.next() else { + continue; + }; + + let entry = by_name + .entry(name.to_string()) + .or_insert_with(|| (Vec::new(), Vec::new())); + match kind { + "(fetch)" => entry.0.push(url.to_string()), + "(push)" => entry.1.push(url.to_string()), + _ => {} + } + } + + let remotes = by_name + .into_iter() + .filter_map(|(name, (mut fetch_urls, mut push_urls))| { + fetch_urls.sort(); + push_urls.sort(); + let fetch_url = fetch_urls.into_iter().next()?; + let push_url = push_urls + .into_iter() + .next() + .unwrap_or_else(|| fetch_url.clone()); + Some(GitRemote { + name, + fetch_url, + push_url, + }) + }) + .collect::>(); + Ok(remotes) +} + +fn select_canonical_remote(remotes: &[GitRemote]) -> Option { + if let Some(origin) = remotes.iter().find(|r| r.name == "origin") { + return Some(origin.fetch_url.clone()); + } + remotes + .iter() + .min_by(|a, b| a.name.cmp(&b.name)) + .map(|r| r.fetch_url.clone()) +} + +fn git_is_dirty(repo_cwd: &Path) -> anyhow::Result { + let diff_unstaged = Command::new("git") + .args(["diff", "--quiet"]) + .current_dir(repo_cwd) + .status() + .context("run git diff --quiet")?; + if !diff_unstaged.success() { + return Ok(true); + } + + let diff_staged = Command::new("git") + .args(["diff", "--cached", "--quiet"]) + .current_dir(repo_cwd) + .status() + .context("run git diff --cached --quiet")?; + if !diff_staged.success() { + return Ok(true); + } + + let untracked = + git_stdout(repo_cwd, &["ls-files", "--others", "--exclude-standard"]).unwrap_or_default(); + Ok(untracked.lines().any(|s| !s.trim().is_empty())) +} + +fn git_stdout(repo_cwd: &Path, args: &[&str]) -> anyhow::Result { + let output = Command::new("git") + .args(args) + .current_dir(repo_cwd) + .output() + .with_context(|| format!("run git {}", args.join(" ")))?; + if !output.status.success() { + anyhow::bail!("git {} failed with {}", args.join(" "), output.status); + } + let out = String::from_utf8(output.stdout).context("decode git stdout")?; + Ok(out.trim().to_string()) +} + +fn git_diff(repo_cwd: &Path, args: &[&str]) -> anyhow::Result> { + let output = Command::new("git") + .args(args) + .current_dir(repo_cwd) + .output() + .with_context(|| format!("run git {}", args.join(" ")))?; + + let exit_ok = output.status.code().is_some_and(|c| c == 0 || c == 1); + if !exit_ok { + anyhow::bail!("git {} failed with {}", args.join(" "), output.status); + } + Ok(output.stdout) +} + +fn git_patch_between_commits( + repo_cwd: &Path, + base_sha: &str, + commit_sha: &str, +) -> anyhow::Result<(String, Vec)> { + let patch = git_diff( + repo_cwd, + &[ + "diff", + "--no-textconv", + "--no-ext-diff", + "--binary", + base_sha, + commit_sha, + ], + )?; + Ok((base_sha.to_string(), patch)) +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + use tempfile::TempDir; + + #[test] + fn creates_bundle_with_manifest_and_rollout() { + let codex_home = TempDir::new().unwrap(); + let repo_dir = TempDir::new().unwrap(); + let repo_root = repo_dir.path().join("my-repo"); + std::fs::create_dir_all(&repo_root).unwrap(); + std::fs::write(repo_root.join("README.md"), "hi\n").unwrap(); + let init_status = Command::new("git") + .args(["init", "-q"]) + .current_dir(&repo_root) + .status() + .unwrap(); + assert!(init_status.success()); + let add_status = Command::new("git") + .args(["add", "."]) + .current_dir(&repo_root) + .status() + .unwrap(); + assert!(add_status.success()); + let commit_status = Command::new("git") + .args([ + "-c", + "user.name=codex", + "-c", + "user.email=codex@example.com", + "commit", + "-m", + "init", + "-q", + ]) + .current_dir(&repo_root) + .status() + .unwrap(); + assert!(commit_status.success()); + + std::fs::write(repo_root.join("README.md"), "changed\n").unwrap(); + + let rollout_path = repo_root.join("rollout.jsonl"); + let rollout = [ + r#"{"timestamp":"2024-01-01T00:00:00Z","type":"token_count","payload":{"total":1}}"#, + r#"{"timestamp":"2024-01-01T00:00:00Z","type":"event_msg","payload":{"type":"user_message","message":"do the thing"}}"#, + r#"{"timestamp":"2024-01-01T00:00:01Z","type":"event_msg","payload":{"type":"agent_message","message":"ok"}}"#, + ] + .join("\n"); + std::fs::write(&rollout_path, format!("{rollout}\n")).unwrap(); + + let args = CreateEvalCaseArgs { + codex_home: codex_home.path().to_path_buf(), + conversation_id: "conv-1".to_string(), + rollout_path, + start: StartMarker { + kind: StartMarkerKind::RolloutLineIndex, + value: StartMarkerValue::LineIndex(1), + display: "Start now".to_string(), + }, + repo_cwd: repo_root, + repo_snapshot: None, + notes: Notes { + what_went_wrong: "bad".to_string(), + what_good_looks_like: "good".to_string(), + }, + include_logs: true, + logs_bytes: Some(b"logs".to_vec()), + }; + + let out = create_eval_case_bundle(&args).unwrap(); + assert!(!out.case_id.is_empty()); + assert!(out.path.exists()); + assert_eq!(out.path.file_name().unwrap(), out.case_id.as_str()); + assert!(out.path.starts_with(codex_home.path().join("eval-case"))); + assert!(out.case_id.contains("my-repo")); + + let manifest_text = std::fs::read_to_string(out.path.join("manifest.json")).unwrap(); + let manifest: EvalCaseManifestV1 = serde_json::from_str(&manifest_text).unwrap(); + assert_eq!(manifest.version, "v1"); + assert_eq!(manifest.conversation_id, "conv-1"); + assert_eq!(manifest.notes, args.notes); + assert!(manifest.repo.git.commit != "unknown"); + assert_eq!(manifest.repo.git.canonical_remote, None); + assert_eq!(manifest.repo.git.reproducible, false); + assert_eq!( + manifest.repo.git.reproducible_reason, + Some("no_git_remote".to_string()) + ); + assert_eq!(manifest.repo.git.is_dirty, true); + assert_eq!(manifest.artifacts.include_logs, true); + assert!(!manifest.rollout.start_selector.contains.trim().is_empty()); + + assert!(out.path.join("repo.patch").exists()); + assert!(out.path.join("rollout.jsonl").exists()); + assert_eq!( + std::fs::read(out.path.join("codex-logs.log")).unwrap(), + b"logs".to_vec() + ); + } + + #[test] + fn creates_bundle_from_repo_snapshot_commit() { + let codex_home = TempDir::new().unwrap(); + let repo_dir = TempDir::new().unwrap(); + let repo_root = repo_dir.path().join("my-repo"); + std::fs::create_dir_all(&repo_root).unwrap(); + + std::fs::write(repo_root.join("README.md"), "base\n").unwrap(); + let init_status = Command::new("git") + .args(["init", "-q"]) + .current_dir(&repo_root) + .status() + .unwrap(); + assert!(init_status.success()); + let add_status = Command::new("git") + .args(["add", "."]) + .current_dir(&repo_root) + .status() + .unwrap(); + assert!(add_status.success()); + let commit_status = Command::new("git") + .args([ + "-c", + "user.name=codex", + "-c", + "user.email=codex@example.com", + "commit", + "-m", + "base", + "-q", + ]) + .current_dir(&repo_root) + .status() + .unwrap(); + assert!(commit_status.success()); + + let base_sha = git_stdout(&repo_root, &["rev-parse", "HEAD"]).unwrap(); + + // Create a snapshot commit. + std::fs::write(repo_root.join("README.md"), "snapshot\n").unwrap(); + std::fs::write(repo_root.join("snap.txt"), "snap\n").unwrap(); + let add_status = Command::new("git") + .args(["add", "."]) + .current_dir(&repo_root) + .status() + .unwrap(); + assert!(add_status.success()); + let commit_status = Command::new("git") + .args([ + "-c", + "user.name=codex", + "-c", + "user.email=codex@example.com", + "commit", + "-m", + "snapshot", + "-q", + ]) + .current_dir(&repo_root) + .status() + .unwrap(); + assert!(commit_status.success()); + let snapshot_sha = git_stdout(&repo_root, &["rev-parse", "HEAD"]).unwrap(); + + // Dirty the working tree after the snapshot commit; this should not affect repo.patch. + std::fs::write(repo_root.join("README.md"), "worktree\n").unwrap(); + + let rollout_path = repo_root.join("rollout.jsonl"); + let rollout = [ + r#"{"timestamp":"2024-01-01T00:00:00Z","type":"event_msg","payload":{"type":"user_message","message":"snapshot run"}}"#, + r#"{"timestamp":"2024-01-01T00:00:01Z","type":"event_msg","payload":{"type":"agent_message","message":"ok"}}"#, + ] + .join("\n"); + std::fs::write(&rollout_path, format!("{rollout}\n")).unwrap(); + + let args = CreateEvalCaseArgs { + codex_home: codex_home.path().to_path_buf(), + conversation_id: "conv-2".to_string(), + rollout_path, + start: StartMarker { + kind: StartMarkerKind::RolloutLineIndex, + value: StartMarkerValue::LineIndex(0), + display: "From: test".to_string(), + }, + repo_cwd: repo_root, + repo_snapshot: Some(RepoSnapshot { + base_sha: base_sha.clone(), + commit_sha: snapshot_sha, + }), + notes: Notes { + what_went_wrong: "bad".to_string(), + what_good_looks_like: "good".to_string(), + }, + include_logs: false, + logs_bytes: None, + }; + + let out = create_eval_case_bundle(&args).unwrap(); + assert_eq!(out.path.file_name().unwrap(), out.case_id.as_str()); + assert!(out.path.starts_with(codex_home.path().join("eval-case"))); + assert!(out.case_id.contains("my-repo")); + let manifest_text = std::fs::read_to_string(out.path.join("manifest.json")).unwrap(); + let manifest: EvalCaseManifestV1 = serde_json::from_str(&manifest_text).unwrap(); + assert_eq!(manifest.repo.git.commit, base_sha); + + let patch_text = + String::from_utf8(std::fs::read(out.path.join("repo.patch")).unwrap()).unwrap(); + assert!( + patch_text.contains("snapshot"), + "patch should include snapshot commit changes" + ); + assert!( + !patch_text.contains("worktree"), + "patch should not include post-snapshot working tree changes" + ); + assert!(patch_text.contains("snap.txt")); + } + + #[test] + fn slices_rollout_using_start_selector_even_when_line_numbers_shift() { + let rollout = [ + r#"{"timestamp":"2024-01-01T00:00:00Z","type":"token_count","payload":{"total":1}}"#, + r#"{"timestamp":"2024-01-01T00:00:00Z","type":"turn_context","payload":{"cwd":"/tmp","approval_policy":"never","sandbox_policy":"none","model":"x","summary":"none"}}"#, + r#"{"timestamp":"2024-01-01T00:00:00Z","type":"event_msg","payload":{"type":"user_message","message":"run the tests please"}}"#, + r#"{"timestamp":"2024-01-01T00:00:01Z","type":"event_msg","payload":{"type":"agent_message","message":"ok"}}"#, + ] + .join("\n"); + + let selector = RolloutStartSelector { + kind: RolloutStartSelectorKind::EventMsgUserMessageContains, + contains: "run the tests".to_string(), + after_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + }; + + let sliced = slice_rollout_from_selector(&rollout, &selector).unwrap(); + // Should include the turn_context that immediately precedes the matched user_message. + assert!( + sliced + .lines() + .next() + .unwrap() + .contains("\"type\":\"turn_context\"") + ); + assert!(sliced.contains("\"type\":\"user_message\"")); + assert!(!sliced.contains("\"type\":\"token_count\"")); + } +} diff --git a/codex-rs/feedback/src/lib.rs b/codex-rs/feedback/src/lib.rs index eaa949717a8..065993e5b70 100644 --- a/codex-rs/feedback/src/lib.rs +++ b/codex-rs/feedback/src/lib.rs @@ -155,8 +155,28 @@ pub struct CodexLogSnapshot { pub thread_id: String, } +pub struct FeedbackAttachment { + pub filename: String, + pub content_type: Option, + pub bytes: Vec, +} + +impl FeedbackAttachment { + pub fn new(filename: String, content_type: Option, bytes: Vec) -> Self { + Self { + filename, + content_type, + bytes, + } + } +} + impl CodexLogSnapshot { - pub(crate) fn as_bytes(&self) -> &[u8] { + pub fn from_bytes(thread_id: String, bytes: Vec) -> Self { + Self { bytes, thread_id } + } + + pub fn as_bytes(&self) -> &[u8] { &self.bytes } @@ -176,6 +196,25 @@ impl CodexLogSnapshot { include_logs: bool, rollout_path: Option<&std::path::Path>, session_source: Option, + ) -> Result<()> { + self.upload_feedback_with_attachments( + classification, + reason, + include_logs, + rollout_path, + session_source, + Vec::new(), + ) + } + + pub fn upload_feedback_with_attachments( + &self, + classification: &str, + reason: Option<&str>, + include_logs: bool, + rollout_path: Option<&std::path::Path>, + session_source: Option, + extra_attachments: Vec, ) -> Result<()> { use std::collections::BTreeMap; use std::fs; @@ -265,6 +304,15 @@ impl CodexLogSnapshot { })); } + for attachment in extra_attachments { + envelope.add_item(EnvelopeItem::Attachment(Attachment { + buffer: attachment.bytes, + filename: attachment.filename, + content_type: attachment.content_type, + ty: None, + })); + } + client.send_envelope(envelope); client.flush(Some(Duration::from_secs(UPLOAD_TIMEOUT_SECS))); Ok(()) diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index cff2a8ad98d..a0dd122b888 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -141,6 +141,11 @@ pub enum Op { summary: Option, }, + /// Enable or disable repository snapshots (ghost commits) for the remainder of the session. + /// + /// When enabled, Codex will capture per-turn git snapshots and record them in the rollout. + SetRepoSnapshotting { enabled: bool }, + /// Approve a command execution ExecApproval { /// The id of the submission we are approving diff --git a/codex-rs/tui/Cargo.toml b/codex-rs/tui/Cargo.toml index 3248c7377c5..614e719d464 100644 --- a/codex-rs/tui/Cargo.toml +++ b/codex-rs/tui/Cargo.toml @@ -36,6 +36,7 @@ codex-common = { workspace = true, features = [ "sandbox_summary", ] } codex-core = { workspace = true } +codex-eval-case = { workspace = true } codex-feedback = { workspace = true } codex-file-search = { workspace = true } codex-login = { workspace = true } diff --git a/codex-rs/tui/src/app.rs b/codex-rs/tui/src/app.rs index c91eafcbe76..2996a24d355 100644 --- a/codex-rs/tui/src/app.rs +++ b/codex-rs/tui/src/app.rs @@ -806,6 +806,32 @@ impl App { AppEvent::OpenFeedbackConsent { category } => { self.chat_widget.open_feedback_consent(category); } + AppEvent::OpenFeedbackBadResultFork => { + self.chat_widget.open_feedback_bad_result_fork(); + } + AppEvent::OpenEvalCaptureIntro => { + self.chat_widget.open_eval_capture_intro(); + } + AppEvent::EvalCaptureIntroContinue => { + self.chat_widget.on_eval_capture_intro_continue(); + } + AppEvent::EvalCaptureSendConsent { upload } => { + self.chat_widget.on_eval_capture_send_consent(upload); + } + AppEvent::OpenEvalCaptureNotes { start_marker } => { + self.chat_widget.open_eval_capture_notes(start_marker); + } + AppEvent::CreateEvalCaptureBundle { + start_marker, + what_went_wrong, + what_good_looks_like, + } => { + self.chat_widget.create_eval_capture_bundle( + start_marker, + what_went_wrong, + what_good_looks_like, + ); + } AppEvent::LaunchExternalEditor => { if self.chat_widget.external_editor_state() == ExternalEditorState::Active { self.launch_external_editor(tui).await; diff --git a/codex-rs/tui/src/app_event.rs b/codex-rs/tui/src/app_event.rs index 1f99e372e97..da83d02d6dd 100644 --- a/codex-rs/tui/src/app_event.rs +++ b/codex-rs/tui/src/app_event.rs @@ -182,6 +182,33 @@ pub(crate) enum AppEvent { category: FeedbackCategory, }, + /// Open the bad-result fork chooser ("quick feedback" vs "capture eval sample"). + OpenFeedbackBadResultFork, + + /// Start the "capture eval sample" flow from /feedback. + OpenEvalCaptureIntro, + + /// Dismiss the eval capture intro screen and continue to the start picker. + EvalCaptureIntroContinue, + + /// Record whether the user consented to upload the eval sample bundle once captured. + /// This happens before the start message picker so the full flow is consented up front. + EvalCaptureSendConsent { + upload: bool, + }, + + /// Open the freeform notes entry for eval capture. + OpenEvalCaptureNotes { + start_marker: EvalCaptureStartMarker, + }, + + /// Create the eval bundle and report the output path/id in history. + CreateEvalCaptureBundle { + start_marker: EvalCaptureStartMarker, + what_went_wrong: String, + what_good_looks_like: String, + }, + /// Launch the external editor after a normal draw has completed. LaunchExternalEditor, } @@ -193,3 +220,27 @@ pub(crate) enum FeedbackCategory { Bug, Other, } + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct EvalCaptureRepoSnapshot { + pub(crate) ghost_commit: String, + pub(crate) base_sha: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum EvalCaptureStartMarker { + RolloutLineIndex { + index: usize, + timestamp: String, + display: String, + message: String, + repo_snapshot: Option, + }, + #[allow(dead_code)] + RolloutLineTimestamp { + timestamp: String, + display: String, + message: String, + repo_snapshot: Option, + }, +} diff --git a/codex-rs/tui/src/bottom_pane/eval_capture_view.rs b/codex-rs/tui/src/bottom_pane/eval_capture_view.rs new file mode 100644 index 00000000000..eac50ab57c0 --- /dev/null +++ b/codex-rs/tui/src/bottom_pane/eval_capture_view.rs @@ -0,0 +1,376 @@ +use std::cell::RefCell; + +use crossterm::event::KeyCode; +use crossterm::event::KeyEvent; +use crossterm::event::KeyModifiers; +use ratatui::buffer::Buffer; +use ratatui::layout::Rect; +use ratatui::style::Stylize; +use ratatui::text::Line; +use ratatui::widgets::Clear; +use ratatui::widgets::Paragraph; +use ratatui::widgets::StatefulWidgetRef; +use ratatui::widgets::Widget; + +use crate::app_event::AppEvent; +use crate::app_event::EvalCaptureStartMarker; +use crate::app_event_sender::AppEventSender; +use crate::render::renderable::Renderable; + +use super::CancellationEvent; +use super::bottom_pane_view::BottomPaneView; +use super::popup_consts::standard_popup_hint_line; +use super::textarea::TextArea; +use super::textarea::TextAreaState; + +pub(crate) fn eval_capture_intro_params( + app_event_tx: AppEventSender, +) -> super::SelectionViewParams { + let continue_action: super::SelectionAction = Box::new({ + let tx = app_event_tx.clone(); + move |_sender: &AppEventSender| { + tx.send(AppEvent::EvalCaptureIntroContinue); + } + }); + + let basic_feedback_action: super::SelectionAction = Box::new({ + let tx = app_event_tx; + move |_sender: &AppEventSender| { + tx.send(AppEvent::OpenFeedbackConsent { + category: crate::app_event::FeedbackCategory::BadResult, + }); + } + }); + + let header_lines: Vec> = vec![ + Line::from("Capture eval sample".bold()).into(), + Line::from("").into(), + Line::from("Here's everything we capture as an eval sample.".dim()).into(), + Line::from("It's stored locally on your computer:".dim()).into(), + Line::from(vec![" • ".into(), "manifest.json".into()]).into(), + Line::from(vec![" • ".into(), "rollout.jsonl".into()]).into(), + Line::from(vec![" • ".into(), "repo.patch".into()]).into(), + Line::from(vec![" • ".into(), "codex-logs.log".into()]).into(), + Line::from("").into(), + Line::from("Next, you can optionally upload this bundle to the team.".dim()).into(), + ]; + + super::SelectionViewParams { + footer_hint: Some(standard_popup_hint_line()), + title: None, + header: Box::new(crate::render::renderable::ColumnRenderable::with( + header_lines, + )), + items: vec![ + super::SelectionItem { + name: "Continue".to_string(), + description: Some("Create an eval sample bundle locally.".to_string()), + actions: vec![continue_action], + dismiss_on_select: true, + ..Default::default() + }, + super::SelectionItem { + name: "Send basic feedback instead".to_string(), + description: Some( + "Skip eval capture and send basic feedback to the team.".to_string(), + ), + actions: vec![basic_feedback_action], + dismiss_on_select: true, + ..Default::default() + }, + ], + ..Default::default() + } +} + +pub(crate) fn eval_capture_send_consent_params( + app_event_tx: AppEventSender, +) -> super::SelectionViewParams { + let upload_action: super::SelectionAction = Box::new({ + let tx = app_event_tx.clone(); + move |_sender: &AppEventSender| { + tx.send(AppEvent::EvalCaptureSendConsent { upload: true }); + } + }); + + let skip_action: super::SelectionAction = Box::new({ + let tx = app_event_tx; + move |_sender: &AppEventSender| { + tx.send(AppEvent::EvalCaptureSendConsent { upload: false }); + } + }); + + let header_lines: Vec> = vec![ + Line::from("Send eval sample to the codex team?".bold()).into(), + Line::from("").into(), + Line::from("If you choose Yes, it will upload the full bundle".dim()).into(), + Line::from("to the team:".dim()).into(), + Line::from(vec![" • ".into(), "manifest.json".into()]).into(), + Line::from(vec![" • ".into(), "rollout.jsonl".into()]).into(), + Line::from(vec![" • ".into(), "repo.patch".into()]).into(), + Line::from(vec![" • ".into(), "codex-logs.log".into()]).into(), + Line::from("").into(), + Line::from("This may include file paths, code snippets, and tool outputs.".dim()).into(), + ]; + + super::SelectionViewParams { + footer_hint: Some(standard_popup_hint_line()), + title: None, + header: Box::new(crate::render::renderable::ColumnRenderable::with( + header_lines, + )), + items: vec![ + super::SelectionItem { + name: "Yes".to_string(), + description: Some( + "Upload the full bundle to the team for troubleshooting and model improvement." + .to_string(), + ), + actions: vec![upload_action], + dismiss_on_select: true, + ..Default::default() + }, + super::SelectionItem { + name: "No".to_string(), + description: Some("Keep it local only.".to_string()), + actions: vec![skip_action], + dismiss_on_select: true, + ..Default::default() + }, + ], + ..Default::default() + } +} + +pub(crate) fn eval_capture_start_picker_params( + app_event_tx: AppEventSender, + options: Vec, + initial_selected_idx: Option, +) -> super::SelectionViewParams { + let items = options + .into_iter() + .map(|marker| { + let action: super::SelectionAction = Box::new({ + let tx = app_event_tx.clone(); + let marker = marker.clone(); + move |_sender: &AppEventSender| { + tx.send(AppEvent::OpenEvalCaptureNotes { + start_marker: marker.clone(), + }); + } + }); + let disabled_reason = (!marker_has_repo_snapshot(&marker)) + .then_some("No repo snapshot available for this message.".to_string()); + super::SelectionItem { + name: marker_display_name(&marker), + // For the eval-capture start picker, we render the full message directly as the + // row name (including a relative timestamp) to avoid duplicating it in the + // description column. + actions: vec![action], + dismiss_on_select: true, + disabled_reason, + ..Default::default() + } + }) + .collect(); + + super::SelectionViewParams { + footer_hint: Some(standard_popup_hint_line()), + title: Some("Pick the start message".to_string()), + subtitle: Some( + "Pick the first instruction for the specific code change or interaction you wanted to give feedback on." + .to_string(), + ), + items, + initial_selected_idx, + show_numbers: true, + ..Default::default() + } +} + +fn marker_display_name(marker: &EvalCaptureStartMarker) -> String { + match marker { + EvalCaptureStartMarker::RolloutLineIndex { display, .. } + | EvalCaptureStartMarker::RolloutLineTimestamp { display, .. } => display.clone(), + } +} + +fn marker_has_repo_snapshot(marker: &EvalCaptureStartMarker) -> bool { + match marker { + EvalCaptureStartMarker::RolloutLineIndex { repo_snapshot, .. } + | EvalCaptureStartMarker::RolloutLineTimestamp { repo_snapshot, .. } => { + repo_snapshot.is_some() + } + } +} + +pub(crate) struct EvalCaptureNotesView { + start_marker: EvalCaptureStartMarker, + app_event_tx: AppEventSender, + + textarea: TextArea, + textarea_state: RefCell, + complete: bool, +} + +impl EvalCaptureNotesView { + pub(crate) fn new(start_marker: EvalCaptureStartMarker, app_event_tx: AppEventSender) -> Self { + Self { + start_marker, + app_event_tx, + textarea: TextArea::new(), + textarea_state: RefCell::new(TextAreaState::default()), + complete: false, + } + } + + fn submit(&mut self) { + let what_went_wrong = self.textarea.text().trim().to_string(); + self.app_event_tx.send(AppEvent::CreateEvalCaptureBundle { + start_marker: self.start_marker.clone(), + what_went_wrong, + what_good_looks_like: String::new(), + }); + self.complete = true; + } +} + +impl BottomPaneView for EvalCaptureNotesView { + fn handle_key_event(&mut self, key_event: KeyEvent) { + match key_event { + KeyEvent { + code: KeyCode::Esc, .. + } => { + self.on_ctrl_c(); + } + KeyEvent { + code: KeyCode::Enter, + modifiers: KeyModifiers::NONE, + .. + } => { + self.submit(); + } + other => { + self.textarea.input(other); + } + } + } + + fn on_ctrl_c(&mut self) -> CancellationEvent { + self.complete = true; + CancellationEvent::Handled + } + + fn is_complete(&self) -> bool { + self.complete + } + + fn handle_paste(&mut self, pasted: String) -> bool { + if pasted.is_empty() { + return false; + } + self.textarea.insert_str(&pasted); + true + } +} + +impl Renderable for EvalCaptureNotesView { + fn desired_height(&self, width: u16) -> u16 { + self.header_lines(width).len() as u16 + self.input_height(width) + 2u16 + } + + fn cursor_pos(&self, area: Rect) -> Option<(u16, u16)> { + if area.height < 2 || area.width <= 2 { + return None; + } + let text_area_height = self.input_height(area.width).saturating_sub(1); + if text_area_height == 0 { + return None; + } + let top_line_count = self.header_lines(area.width).len() as u16; + let textarea_rect = Rect { + x: area.x.saturating_add(2), + y: area.y.saturating_add(top_line_count).saturating_add(1), + width: area.width.saturating_sub(2), + height: text_area_height, + }; + let state = *self.textarea_state.borrow(); + self.textarea.cursor_pos_with_state(textarea_rect, state) + } + + fn render(&self, area: Rect, buf: &mut Buffer) { + if area.height == 0 || area.width == 0 { + return; + } + + let placeholder = "Type here."; + + let input_height = self.input_height(area.width); + let header_lines = self.header_lines(area.width); + let header_height = header_lines.len() as u16; + + let title_area = Rect { + x: area.x, + y: area.y, + width: area.width, + height: header_height, + }; + let input_area = Rect { + x: area.x, + y: area.y.saturating_add(header_height), + width: area.width, + height: input_height, + }; + + Clear.render(area, buf); + + Paragraph::new(header_lines).render(title_area, buf); + + let textarea_rect = Rect { + x: input_area.x.saturating_add(2), + y: input_area.y.saturating_add(1), + width: input_area.width.saturating_sub(2), + height: input_area.height.saturating_sub(1), + }; + let mut state = self.textarea_state.borrow_mut(); + StatefulWidgetRef::render_ref(&(&self.textarea), textarea_rect, buf, &mut state); + if self.textarea.text().is_empty() { + Paragraph::new(Line::from(placeholder.dim())).render(textarea_rect, buf); + } + + let hint_area = Rect { + x: area.x, + y: area + .y + .saturating_add(header_height) + .saturating_add(input_height) + .saturating_add(1), + width: area.width, + height: 1, + }; + Paragraph::new(Line::from("Enter to continue • Esc to go back".dim())) + .render(hint_area, buf); + } +} + +impl EvalCaptureNotesView { + fn header_lines(&self, width: u16) -> Vec> { + let title = "What went wrong with this interaction, and what should have happened for this to be a correct, delightful response?"; + let subtitle = "Please be specific, and can refer to any aspect of model behavior, including tool calls, code changes, or messages."; + let usable_width = width.saturating_sub(2).max(1) as usize; + let mut lines = Vec::new(); + for line in textwrap::wrap(title, usable_width) { + lines.push(Line::from(line.into_owned().bold())); + } + for line in textwrap::wrap(subtitle, usable_width) { + lines.push(Line::from(line.into_owned().dim())); + } + lines + } + + fn input_height(&self, width: u16) -> u16 { + let usable_width = width.saturating_sub(2); + let text_height = self.textarea.desired_height(usable_width).clamp(1, 8); + text_height.saturating_add(1).min(9) + } +} diff --git a/codex-rs/tui/src/bottom_pane/feedback_view.rs b/codex-rs/tui/src/bottom_pane/feedback_view.rs index c563ab8e90b..18031ca1220 100644 --- a/codex-rs/tui/src/bottom_pane/feedback_view.rs +++ b/codex-rs/tui/src/bottom_pane/feedback_view.rs @@ -387,7 +387,11 @@ fn make_feedback_item( category: FeedbackCategory, ) -> super::SelectionItem { let action: super::SelectionAction = Box::new(move |_sender: &AppEventSender| { - app_event_tx.send(AppEvent::OpenFeedbackConsent { category }); + if category == FeedbackCategory::BadResult { + app_event_tx.send(AppEvent::OpenFeedbackBadResultFork); + } else { + app_event_tx.send(AppEvent::OpenFeedbackConsent { category }); + } }); super::SelectionItem { name: name.to_string(), @@ -398,6 +402,52 @@ fn make_feedback_item( } } +pub(crate) fn feedback_bad_result_fork_params( + app_event_tx: AppEventSender, +) -> super::SelectionViewParams { + let quick_action: super::SelectionAction = Box::new({ + let tx = app_event_tx.clone(); + move |_sender: &AppEventSender| { + tx.send(AppEvent::OpenFeedbackConsent { + category: FeedbackCategory::BadResult, + }); + } + }); + + let capture_action: super::SelectionAction = Box::new({ + let tx = app_event_tx; + move |_sender: &AppEventSender| { + tx.send(AppEvent::OpenEvalCaptureIntro); + } + }); + + super::SelectionViewParams { + title: Some("Bad result".to_string()), + subtitle: Some("Choose the fastest way to help us improve.".to_string()), + items: vec![ + super::SelectionItem { + name: "Quick feedback".to_string(), + description: Some( + "Send a short note (and optionally logs) to maintainers.".to_string(), + ), + actions: vec![quick_action], + dismiss_on_select: true, + ..Default::default() + }, + super::SelectionItem { + name: "Capture eval sample".to_string(), + description: Some( + "Create a local bundle you can rerun as an eval case later.".to_string(), + ), + actions: vec![capture_action], + dismiss_on_select: true, + ..Default::default() + }, + ], + ..Default::default() + } +} + /// Build the upload consent popup params for a given feedback category. pub(crate) fn feedback_upload_consent_params( app_event_tx: AppEventSender, diff --git a/codex-rs/tui/src/bottom_pane/list_selection_view.rs b/codex-rs/tui/src/bottom_pane/list_selection_view.rs index e387b668613..d4e28f9be61 100644 --- a/codex-rs/tui/src/bottom_pane/list_selection_view.rs +++ b/codex-rs/tui/src/bottom_pane/list_selection_view.rs @@ -54,6 +54,7 @@ pub(crate) struct SelectionViewParams { pub items: Vec, pub is_searchable: bool, pub search_placeholder: Option, + pub show_numbers: bool, pub header: Box, pub initial_selected_idx: Option, } @@ -67,12 +68,52 @@ impl Default for SelectionViewParams { items: Vec::new(), is_searchable: false, search_placeholder: None, + show_numbers: true, header: Box::new(()), initial_selected_idx: None, } } } +struct WrappedText { + text: String, + dim: bool, +} + +impl WrappedText { + fn dim(text: String) -> Self { + Self { text, dim: true } + } + + fn wrapped_lines(&self, width: u16) -> Vec> { + let usable_width = width.max(1) as usize; + textwrap::wrap(&self.text, usable_width) + .into_iter() + .map(|line| { + let span = Span::from(line.into_owned()); + if self.dim { + Line::from(span.dim()) + } else { + Line::from(span) + } + }) + .collect() + } +} + +impl Renderable for WrappedText { + fn render(&self, area: Rect, buf: &mut Buffer) { + Paragraph::new(self.wrapped_lines(area.width)).render(area, buf); + } + + fn desired_height(&self, width: u16) -> u16 { + self.wrapped_lines(width) + .len() + .try_into() + .unwrap_or(u16::MAX) + } +} + pub(crate) struct ListSelectionView { footer_hint: Option>, items: Vec, @@ -80,6 +121,7 @@ pub(crate) struct ListSelectionView { complete: bool, app_event_tx: AppEventSender, is_searchable: bool, + show_numbers: bool, search_query: String, search_placeholder: Option, filtered_indices: Vec, @@ -93,7 +135,7 @@ impl ListSelectionView { let mut header = params.header; if params.title.is_some() || params.subtitle.is_some() { let title = params.title.map(|title| Line::from(title.bold())); - let subtitle = params.subtitle.map(|subtitle| Line::from(subtitle.dim())); + let subtitle = params.subtitle.map(WrappedText::dim); header = Box::new(ColumnRenderable::with([ header, Box::new(title), @@ -107,6 +149,7 @@ impl ListSelectionView { complete: false, app_event_tx, is_searchable: params.is_searchable, + show_numbers: params.show_numbers, search_query: String::new(), search_placeholder: if params.is_searchable { params.search_placeholder @@ -198,13 +241,9 @@ impl ListSelectionView { }; let name_with_marker = format!("{name}{marker}"); let n = visible_idx + 1; - let wrap_prefix = if self.is_searchable { - // The number keys don't work when search is enabled (since we let the - // numbers be used for the search query). - format!("{prefix} ") - } else { - format!("{prefix} {n}. ") - }; + let wrap_prefix = (self.show_numbers && !self.is_searchable) + .then(|| format!("{prefix} {n}. ")) + .unwrap_or_else(|| format!("{prefix} ")); let wrap_prefix_width = UnicodeWidthStr::width(wrap_prefix.as_str()); let display_name = format!("{wrap_prefix}{name_with_marker}"); let description = is_selected @@ -386,6 +425,7 @@ impl BottomPaneView for ListSelectionView { .to_digit(10) .map(|d| d as usize) .and_then(|d| d.checked_sub(1)) + .filter(|_| self.show_numbers) && idx < self.items.len() && self .items diff --git a/codex-rs/tui/src/bottom_pane/mod.rs b/codex-rs/tui/src/bottom_pane/mod.rs index 7634f699aa0..945f43367ad 100644 --- a/codex-rs/tui/src/bottom_pane/mod.rs +++ b/codex-rs/tui/src/bottom_pane/mod.rs @@ -26,6 +26,7 @@ mod chat_composer; mod chat_composer_history; mod command_popup; pub mod custom_prompt_view; +mod eval_capture_view; mod experimental_features_view; mod file_search_popup; mod footer; @@ -34,6 +35,11 @@ mod prompt_args; mod skill_popup; pub(crate) use list_selection_view::SelectionViewParams; mod feedback_view; +pub(crate) use eval_capture_view::EvalCaptureNotesView; +pub(crate) use eval_capture_view::eval_capture_intro_params; +pub(crate) use eval_capture_view::eval_capture_send_consent_params; +pub(crate) use eval_capture_view::eval_capture_start_picker_params; +pub(crate) use feedback_view::feedback_bad_result_fork_params; pub(crate) use feedback_view::feedback_selection_params; pub(crate) use feedback_view::feedback_upload_consent_params; mod paste_burst; diff --git a/codex-rs/tui/src/bottom_pane/selection_popup_common.rs b/codex-rs/tui/src/bottom_pane/selection_popup_common.rs index 48adef9b2c9..80f584888cd 100644 --- a/codex-rs/tui/src/bottom_pane/selection_popup_common.rs +++ b/codex-rs/tui/src/bottom_pane/selection_popup_common.rs @@ -110,7 +110,10 @@ fn compute_desc_col( let max_name_width = rows_all .iter() .enumerate() - .filter(|(i, _)| visible_range.contains(i)) + .filter(|(i, row)| { + visible_range.contains(i) + && (row.description.is_some() || row.disabled_reason.is_some()) + }) .map(|(_, r)| { let mut spans: Vec = vec![r.name.clone().into()]; if r.disabled_reason.is_some() { diff --git a/codex-rs/tui/src/chatwidget.rs b/codex-rs/tui/src/chatwidget.rs index 819ea2b9558..6a63e75dbce 100644 --- a/codex-rs/tui/src/chatwidget.rs +++ b/codex-rs/tui/src/chatwidget.rs @@ -132,7 +132,9 @@ use self::session_header::SessionHeader; use crate::streaming::controller::StreamController; use std::path::Path; +use chrono::DateTime; use chrono::Local; +use chrono::Utc; use codex_common::approval_presets::ApprovalPreset; use codex_common::approval_presets::builtin_approval_presets; use codex_core::AuthManager; @@ -368,6 +370,9 @@ pub(crate) struct ChatWidget { feedback: codex_feedback::CodexFeedback, // Current session rollout path (if known) current_rollout_path: Option, + // Whether the user consented to upload the captured eval sample to the team. + // This is captured before the start picker so the flow is consented up front. + eval_capture_send_to_team: Option, external_editor_state: ExternalEditorState, } @@ -438,6 +443,9 @@ impl ChatWidget { self.set_skills(None); self.conversation_id = Some(event.session_id); self.current_rollout_path = Some(event.rollout_path.clone()); + // Repo snapshots are stored locally and are required for capturing eval cases + // from arbitrary earlier user messages. + self.submit_op(Op::SetRepoSnapshotting { enabled: true }); let initial_messages = event.initial_messages.clone(); let model_for_header = event.model.clone(); self.session_header.set_model(&model_for_header); @@ -506,6 +514,479 @@ impl ChatWidget { self.request_redraw(); } + pub(crate) fn open_feedback_bad_result_fork(&mut self) { + let params = crate::bottom_pane::feedback_bad_result_fork_params(self.app_event_tx.clone()); + self.bottom_pane.show_selection_view(params); + self.request_redraw(); + } + + pub(crate) fn open_eval_capture_intro(&mut self) { + self.eval_capture_send_to_team = None; + if !crate::eval_capture_state::should_show_eval_capture_intro(&self.config.codex_home) { + self.open_eval_capture_send_consent(); + return; + } + + let params = crate::bottom_pane::eval_capture_intro_params(self.app_event_tx.clone()); + self.bottom_pane.show_selection_view(params); + self.request_redraw(); + } + + pub(crate) fn on_eval_capture_intro_continue(&mut self) { + if let Err(err) = + crate::eval_capture_state::persist_eval_capture_intro_dismissed(&self.config.codex_home) + { + self.add_error_message(format!("Failed to persist eval capture preference: {err}")); + } + + self.open_eval_capture_send_consent(); + } + + fn open_eval_capture_send_consent(&mut self) { + let params = + crate::bottom_pane::eval_capture_send_consent_params(self.app_event_tx.clone()); + self.bottom_pane.show_selection_view(params); + self.request_redraw(); + } + + pub(crate) fn on_eval_capture_send_consent(&mut self, upload: bool) { + self.eval_capture_send_to_team = Some(upload); + self.open_eval_capture_start_picker(); + } + + fn open_eval_capture_start_picker(&mut self) { + let Some(rollout_path) = self.current_rollout_path.as_ref() else { + self.add_error_message( + "No rollout file for this session; cannot capture eval sample.".to_string(), + ); + return; + }; + + let text = match std::fs::read_to_string(rollout_path) { + Ok(text) => text, + Err(err) => { + self.add_error_message(format!( + "Failed to read rollout file {}: {err}", + rollout_path.display() + )); + return; + } + }; + // List every user message as a candidate start point, and attach the next + // ghost snapshot to the most recent user message. Markers without snapshots + // will be disabled in the picker. + let mut options = Vec::new(); + let mut last_user_marker_idx: Option = None; + let mut max_timestamp: Option> = None; + for (idx, line) in text.lines().enumerate() { + let Ok(rollout_line) = + serde_json::from_str::(line) + else { + continue; + }; + if let Some(ts) = parse_rollout_timestamp_utc(&rollout_line.timestamp) { + max_timestamp = Some(max_timestamp.map_or(ts, |cur| cur.max(ts))); + } + match rollout_line.item { + codex_protocol::protocol::RolloutItem::ResponseItem( + codex_protocol::models::ResponseItem::Message { role, content, .. }, + ) if role == "user" => { + let Some(message) = extract_user_message_text(&content) else { + continue; + }; + if !should_show_eval_capture_start_message(&message) { + continue; + } + options.push(crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { + index: idx, + timestamp: rollout_line.timestamp.clone(), + // Filled in below once we know the latest timestamp in the rollout. + display: String::new(), + message, + repo_snapshot: None, + }); + last_user_marker_idx = Some(options.len().saturating_sub(1)); + } + codex_protocol::protocol::RolloutItem::ResponseItem( + codex_protocol::models::ResponseItem::GhostSnapshot { ghost_commit }, + ) => { + if let Some(marker_idx) = last_user_marker_idx + && let Some(base_sha) = ghost_commit.parent() + { + if let Some(crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { + repo_snapshot, + .. + }) = options.get_mut(marker_idx) + { + *repo_snapshot = Some(crate::app_event::EvalCaptureRepoSnapshot { + ghost_commit: ghost_commit.id().to_string(), + base_sha: Some(base_sha.to_string()), + }); + } + } + } + _ => {} + } + } + + if let Some(now) = max_timestamp { + for marker in &mut options { + match marker { + crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { + timestamp, + display, + message, + .. + } + | crate::app_event::EvalCaptureStartMarker::RolloutLineTimestamp { + timestamp, + display, + message, + .. + } => { + let Some(then) = parse_rollout_timestamp_utc(timestamp) else { + *display = message.clone(); + continue; + }; + let delta = format_relative_time_ago(now, then); + *display = format!("[{delta}] {message}"); + } + } + } + } else { + for marker in &mut options { + match marker { + crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { + display, + message, + .. + } + | crate::app_event::EvalCaptureStartMarker::RolloutLineTimestamp { + display, + message, + .. + } => { + *display = message.clone(); + } + } + } + } + + options.sort_by(|a, b| match (a, b) { + ( + crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { index: a, .. }, + crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { index: b, .. }, + ) => b.cmp(a), + _ => std::cmp::Ordering::Equal, + }); + + if !options.iter().any(|marker| match marker { + crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { + repo_snapshot, .. + } + | crate::app_event::EvalCaptureStartMarker::RolloutLineTimestamp { + repo_snapshot, + .. + } => repo_snapshot.is_some(), + }) { + self.add_error_message( + "No repo snapshots are available yet in this session. Try again after another message/turn (or in a new session)." + .to_string(), + ); + self.open_feedback_consent(crate::app_event::FeedbackCategory::BadResult); + return; + } + + let initial_selected_idx = options.iter().position(|marker| match marker { + crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { + repo_snapshot, .. + } + | crate::app_event::EvalCaptureStartMarker::RolloutLineTimestamp { + repo_snapshot, + .. + } => repo_snapshot.is_some(), + }); + let params = crate::bottom_pane::eval_capture_start_picker_params( + self.app_event_tx.clone(), + options, + initial_selected_idx, + ); + self.bottom_pane.show_selection_view(params); + self.request_redraw(); + } + + pub(crate) fn open_eval_capture_notes( + &mut self, + start_marker: crate::app_event::EvalCaptureStartMarker, + ) { + let view = + crate::bottom_pane::EvalCaptureNotesView::new(start_marker, self.app_event_tx.clone()); + self.bottom_pane.show_view(Box::new(view)); + self.request_redraw(); + } + + pub(crate) fn create_eval_capture_bundle( + &mut self, + start_marker: crate::app_event::EvalCaptureStartMarker, + what_went_wrong: String, + what_good_looks_like: String, + ) { + let Some(conversation_id) = self.conversation_id else { + self.add_error_message( + "No active conversation; cannot capture eval sample.".to_string(), + ); + return; + }; + let Some(rollout_path) = self.current_rollout_path.clone() else { + self.add_error_message( + "No rollout file for this session; cannot capture eval sample.".to_string(), + ); + return; + }; + + let (start, repo_snapshot) = match start_marker { + crate::app_event::EvalCaptureStartMarker::RolloutLineIndex { + index, + timestamp: _, + display, + message: _, + repo_snapshot, + } => { + let Some(snapshot) = repo_snapshot else { + self.add_error_message( + "No repo snapshot available for the selected start marker; cannot capture eval sample." + .to_string(), + ); + return; + }; + let Some(base_sha) = snapshot.base_sha else { + self.add_error_message( + "No base sha for selected repo snapshot; cannot capture eval sample." + .to_string(), + ); + return; + }; + let repo_snapshot = Some(codex_eval_case::RepoSnapshot { + base_sha, + commit_sha: snapshot.ghost_commit, + }); + ( + codex_eval_case::StartMarker { + kind: codex_eval_case::StartMarkerKind::RolloutLineIndex, + value: codex_eval_case::StartMarkerValue::LineIndex(index as u64), + display, + }, + repo_snapshot, + ) + } + crate::app_event::EvalCaptureStartMarker::RolloutLineTimestamp { + timestamp, + display, + message: _, + repo_snapshot, + } => { + let Some(snapshot) = repo_snapshot else { + self.add_error_message( + "No repo snapshot available for the selected start marker; cannot capture eval sample." + .to_string(), + ); + return; + }; + let Some(base_sha) = snapshot.base_sha else { + self.add_error_message( + "No base sha for selected repo snapshot; cannot capture eval sample." + .to_string(), + ); + return; + }; + let repo_snapshot = Some(codex_eval_case::RepoSnapshot { + base_sha, + commit_sha: snapshot.ghost_commit, + }); + ( + codex_eval_case::StartMarker { + kind: codex_eval_case::StartMarkerKind::RolloutLineTimestamp, + value: codex_eval_case::StartMarkerValue::Timestamp(timestamp), + display, + }, + repo_snapshot, + ) + } + }; + + let logs_bytes = Some( + self.feedback + .snapshot(Some(conversation_id)) + .as_bytes() + .to_vec(), + ); + + let args = codex_eval_case::CreateEvalCaseArgs { + codex_home: self.config.codex_home.clone(), + conversation_id: conversation_id.to_string(), + rollout_path, + start, + repo_cwd: self.config.cwd.clone(), + repo_snapshot, + notes: codex_eval_case::Notes { + what_went_wrong, + what_good_looks_like, + }, + include_logs: true, + logs_bytes, + }; + + match codex_eval_case::create_eval_case_bundle(&args) { + Ok(result) => { + use ratatui::style::Stylize as _; + use ratatui::text::Line; + + let case_id = result.case_id; + let path = result.path.display().to_string(); + let lines = vec![ + Line::from("• Eval sample captured (local only).".to_string()), + "".into(), + Line::from(vec![" Case ID: ".into(), case_id.clone().bold()]), + Line::from(vec![" Path: ".into(), path.clone().cyan().underlined()]), + ]; + self.app_event_tx + .send(crate::app_event::AppEvent::InsertHistoryCell(Box::new( + crate::history_cell::PlainHistoryCell::new(lines), + ))); + + if self.eval_capture_send_to_team.unwrap_or(false) { + self.upload_eval_capture_bundle(case_id, path); + } else { + self.eval_capture_upload_skipped(case_id, path); + } + } + Err(err) => { + self.app_event_tx + .send(crate::app_event::AppEvent::InsertHistoryCell(Box::new( + crate::history_cell::new_error_event(format!( + "Failed to create eval sample bundle: {err}" + )), + ))); + } + } + } + + pub(crate) fn upload_eval_capture_bundle(&mut self, case_id: String, path: String) { + let Some(conversation_id) = self.conversation_id else { + self.add_error_message( + "No active conversation; cannot upload eval sample.".to_string(), + ); + return; + }; + + let bundle_dir = std::path::PathBuf::from(&path); + let manifest_path = bundle_dir.join("manifest.json"); + let rollout_path = bundle_dir.join("rollout.jsonl"); + let repo_patch_path = bundle_dir.join("repo.patch"); + let logs_path = bundle_dir.join("codex-logs.log"); + + let read_bytes = |p: &std::path::Path| std::fs::read(p).map_err(anyhow::Error::from); + + let logs_bytes = match read_bytes(&logs_path) { + Ok(bytes) => bytes, + Err(err) => { + self.add_error_message(format!( + "Failed to read eval sample logs {}: {err}", + logs_path.display() + )); + return; + } + }; + + let snapshot = self.feedback.snapshot(Some(conversation_id)); + let snapshot = + codex_feedback::CodexLogSnapshot::from_bytes(snapshot.thread_id.clone(), logs_bytes); + + let extra_attachments = match (read_bytes(&manifest_path), read_bytes(&repo_patch_path)) { + (Ok(manifest_bytes), Ok(repo_patch_bytes)) => vec![ + codex_feedback::FeedbackAttachment::new( + "manifest.json".to_string(), + Some("application/json".to_string()), + manifest_bytes, + ), + codex_feedback::FeedbackAttachment::new( + "repo.patch".to_string(), + Some("text/plain".to_string()), + repo_patch_bytes, + ), + ], + (manifest_err, repo_err) => { + if let Err(err) = manifest_err { + self.add_error_message(format!( + "Failed to read eval sample manifest {}: {err}", + manifest_path.display() + )); + } + if let Err(err) = repo_err { + self.add_error_message(format!( + "Failed to read eval sample repo.patch {}: {err}", + repo_patch_path.display() + )); + } + return; + } + }; + + let reason = format!("Eval sample {case_id}"); + let result = snapshot.upload_feedback_with_attachments( + "bad_result", + Some(reason.as_str()), + true, + Some(rollout_path.as_path()), + Some(codex_core::protocol::SessionSource::Cli), + extra_attachments, + ); + + match result { + Ok(()) => { + use ratatui::style::Stylize as _; + use ratatui::text::Line; + + let lines = vec![ + Line::from("• Eval sample uploaded.".to_string()), + "".into(), + Line::from(vec![ + " Thread ID: ".into(), + snapshot.thread_id.clone().bold(), + ]), + ]; + self.app_event_tx + .send(crate::app_event::AppEvent::InsertHistoryCell(Box::new( + crate::history_cell::PlainHistoryCell::new(lines), + ))); + } + Err(err) => { + self.app_event_tx + .send(crate::app_event::AppEvent::InsertHistoryCell(Box::new( + crate::history_cell::new_error_event(format!( + "Failed to upload eval sample: {err}" + )), + ))); + } + } + self.request_redraw(); + } + + pub(crate) fn eval_capture_upload_skipped(&mut self, case_id: String, path: String) { + let _ = (case_id, path); + use ratatui::text::Line; + + // The "captured" message already prints the case id/path. Avoid duplicating it here. + let lines = vec![Line::from( + "• Eval sample saved locally (not uploaded).".to_string(), + )]; + self.app_event_tx + .send(crate::app_event::AppEvent::InsertHistoryCell(Box::new( + crate::history_cell::PlainHistoryCell::new(lines), + ))); + self.request_redraw(); + } + fn on_agent_message(&mut self, message: String) { // If we have a stream_controller, then the final agent message is redundant and will be a // duplicate of what has already been streamed. @@ -1482,6 +1963,7 @@ impl ChatWidget { last_rendered_width: std::cell::Cell::new(None), feedback, current_rollout_path: None, + eval_capture_send_to_team: None, external_editor_state: ExternalEditorState::Closed, }; @@ -1569,6 +2051,7 @@ impl ChatWidget { last_rendered_width: std::cell::Cell::new(None), feedback, current_rollout_path: None, + eval_capture_send_to_team: None, external_editor_state: ExternalEditorState::Closed, }; @@ -3695,6 +4178,73 @@ async fn fetch_rate_limits(base_url: String, auth: CodexAuth) -> Option Option { + let mut combined = String::new(); + for item in content { + let text = match item { + codex_protocol::models::ContentItem::InputText { text } + | codex_protocol::models::ContentItem::OutputText { text } => text.as_str(), + codex_protocol::models::ContentItem::InputImage { .. } => continue, + }; + if !combined.is_empty() { + combined.push(' '); + } + combined.push_str(text); + } + + let mut normalized = String::new(); + for part in combined.split_whitespace() { + if !normalized.is_empty() { + normalized.push(' '); + } + normalized.push_str(part); + } + + let trimmed = normalized.trim(); + if trimmed.is_empty() { + None + } else { + // Keep the picker readable even for very long user messages. + Some(truncate_text(trimmed, 800)) + } +} + +fn should_show_eval_capture_start_message(message: &str) -> bool { + // The rollout includes a few synthetic "user" messages (environment context, + // AGENTS instructions) that should not be presented back to the user as start markers. + let trimmed = message.trim_start(); + if trimmed.starts_with("") { + return false; + } + if trimmed.starts_with("# AGENTS.md instructions") { + return false; + } + true +} + +fn parse_rollout_timestamp_utc(timestamp: &str) -> Option> { + DateTime::parse_from_rfc3339(timestamp) + .ok() + .map(|dt| dt.with_timezone(&Utc)) +} + +fn format_relative_time_ago(now: DateTime, then: DateTime) -> String { + let secs = now.signed_duration_since(then).num_seconds().max(0); + if secs < 60 { + return format!("{secs}s ago"); + } + let mins = secs / 60; + if mins < 60 { + return format!("{mins}m ago"); + } + let hours = mins / 60; + if hours < 24 { + return format!("{hours}h ago"); + } + let days = hours / 24; + format!("{days}d ago") +} + #[cfg(test)] pub(crate) fn show_review_commit_picker_with_entries( chat: &mut ChatWidget, diff --git a/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_intro_popup.snap b/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_intro_popup.snap new file mode 100644 index 00000000000..48eabcaef0e --- /dev/null +++ b/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_intro_popup.snap @@ -0,0 +1,21 @@ +--- +source: tui/src/chatwidget/tests.rs +assertion_line: 2190 +expression: popup +--- + Capture eval sample + + Here's everything we capture as an eval sample. + It's stored locally on your computer: + • manifest.json + • rollout.jsonl + • repo.patch + • codex-logs.log + + Next, you can optionally upload this bundle to the team. + +› 1. Continue Create an eval sample bundle locally. + 2. Send basic feedback instead Skip eval capture and send basic feedback to + the team. + + Press enter to confirm or esc to go back diff --git a/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_start_picker_popup.snap b/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_start_picker_popup.snap new file mode 100644 index 00000000000..72a4e381c53 --- /dev/null +++ b/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_start_picker_popup.snap @@ -0,0 +1,11 @@ +--- +source: tui/src/chatwidget/tests.rs +expression: popup +--- + Pick the start message + Pick the first instruction for the specific code change or interaction you + wanted to give feedback on. + +› 1. [0s ago] Please help me fix the build errors + + Press enter to confirm or esc to go back diff --git a/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_upload_consent_popup.snap b/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_upload_consent_popup.snap new file mode 100644 index 00000000000..c78f96fea75 --- /dev/null +++ b/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__eval_capture_upload_consent_popup.snap @@ -0,0 +1,20 @@ +--- +source: tui/src/chatwidget/tests.rs +expression: popup +--- + Send eval sample to the codex team? + + If you choose Yes, it will upload the full bundle + to the team: + • manifest.json + • rollout.jsonl + • repo.patch + • codex-logs.log + + This may include file paths, code snippets, and tool outputs. + +› 1. Yes Upload the full bundle to the team for troubleshooting and model + improvement. + 2. No Keep it local only. + + Press enter to confirm or esc to go back diff --git a/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__feedback_bad_result_fork_popup.snap b/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__feedback_bad_result_fork_popup.snap new file mode 100644 index 00000000000..e4a51475261 --- /dev/null +++ b/codex-rs/tui/src/chatwidget/snapshots/codex_tui__chatwidget__tests__feedback_bad_result_fork_popup.snap @@ -0,0 +1,11 @@ +--- +source: tui/src/chatwidget/tests.rs +expression: popup +--- + Bad result + Choose the fastest way to help us improve. + +› 1. Quick feedback Send a short note (and optionally logs) to + maintainers. + 2. Capture eval sample Create a local bundle you can rerun as an eval case + later. diff --git a/codex-rs/tui/src/chatwidget/tests.rs b/codex-rs/tui/src/chatwidget/tests.rs index d8d50e5db0e..e08ad69d938 100644 --- a/codex-rs/tui/src/chatwidget/tests.rs +++ b/codex-rs/tui/src/chatwidget/tests.rs @@ -407,6 +407,7 @@ async fn make_chatwidget_manual( last_rendered_width: std::cell::Cell::new(None), feedback: codex_feedback::CodexFeedback::new(), current_rollout_path: None, + eval_capture_send_to_team: None, external_editor_state: ExternalEditorState::Closed, }; (widget, rx, op_rx) @@ -2167,6 +2168,88 @@ async fn feedback_upload_consent_popup_snapshot() { assert_snapshot!("feedback_upload_consent_popup", popup); } +#[tokio::test] +async fn feedback_bad_result_fork_popup_snapshot() { + let (mut chat, _rx, _op_rx) = make_chatwidget_manual(None).await; + + chat.open_feedback_bad_result_fork(); + + let popup = render_bottom_popup(&chat, 80); + assert_snapshot!("feedback_bad_result_fork_popup", popup); +} + +#[tokio::test] +async fn eval_capture_intro_popup_snapshot() { + let (mut chat, _rx, _op_rx) = make_chatwidget_manual(None).await; + + // Ensure host state doesn't leak into snapshots. + let _ = std::fs::remove_file(chat.config.codex_home.join("eval_capture.json")); + + chat.open_eval_capture_intro(); + + let popup = render_bottom_popup(&chat, 80); + assert_snapshot!("eval_capture_intro_popup", popup); +} + +#[tokio::test] +async fn eval_capture_upload_consent_popup_snapshot() { + let (mut chat, _rx, _op_rx) = make_chatwidget_manual(None).await; + + let params = crate::bottom_pane::eval_capture_send_consent_params(chat.app_event_tx.clone()); + chat.bottom_pane.show_selection_view(params); + chat.request_redraw(); + + let popup = render_bottom_popup(&chat, 80); + assert_snapshot!("eval_capture_upload_consent_popup", popup); +} + +#[tokio::test] +async fn eval_capture_start_picker_popup_snapshot() { + let (mut chat, _rx, _op_rx) = make_chatwidget_manual(None).await; + + let rollout_file = NamedTempFile::new().unwrap(); + let user_line = serde_json::json!({ + "timestamp": "2025-09-05T16:53:11.850Z", + "type": "response_item", + "payload": { + "type": "message", + "role": "user", + "content": [{ + "type": "input_text", + "text": "Please help me fix the build errors", + }], + }, + }); + let ghost_line = serde_json::json!({ + "timestamp": "2025-09-05T16:53:12.000Z", + "type": "response_item", + "payload": { + "type": "ghost_snapshot", + "ghost_commit": { + "id": "ghost-sha", + "parent": "base-sha", + "preexisting_untracked_files": [], + "preexisting_untracked_dirs": [], + }, + }, + }); + std::fs::write( + rollout_file.path(), + format!( + "{}\n{}\n", + serde_json::to_string(&user_line).unwrap(), + serde_json::to_string(&ghost_line).unwrap() + ), + ) + .unwrap(); + chat.current_rollout_path = Some(rollout_file.path().to_path_buf()); + + chat.open_eval_capture_start_picker(); + + let popup = render_bottom_popup(&chat, 80); + assert_snapshot!("eval_capture_start_picker_popup", popup); +} + #[tokio::test] async fn reasoning_popup_escape_returns_to_model_popup() { let (mut chat, _rx, _op_rx) = make_chatwidget_manual(Some("gpt-5.1-codex-max")).await; diff --git a/codex-rs/tui/src/eval_capture_state.rs b/codex-rs/tui/src/eval_capture_state.rs new file mode 100644 index 00000000000..ba767a393a5 --- /dev/null +++ b/codex-rs/tui/src/eval_capture_state.rs @@ -0,0 +1,40 @@ +use serde::Deserialize; +use serde::Serialize; +use std::path::Path; +use std::path::PathBuf; + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +struct EvalCaptureState { + #[serde(default)] + intro_dismissed: bool, +} + +const STATE_FILENAME: &str = "eval_capture.json"; + +fn state_path(codex_home: &Path) -> PathBuf { + codex_home.join(STATE_FILENAME) +} + +pub(crate) fn should_show_eval_capture_intro(codex_home: &Path) -> bool { + let path = state_path(codex_home); + let Ok(contents) = std::fs::read_to_string(path) else { + return true; + }; + serde_json::from_str::(&contents) + .ok() + .map(|s| !s.intro_dismissed) + .unwrap_or(true) +} + +pub(crate) fn persist_eval_capture_intro_dismissed(codex_home: &Path) -> anyhow::Result<()> { + let path = state_path(codex_home); + let state = EvalCaptureState { + intro_dismissed: true, + }; + let json_line = format!("{}\n", serde_json::to_string(&state)?); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } + std::fs::write(path, json_line)?; + Ok(()) +} diff --git a/codex-rs/tui/src/lib.rs b/codex-rs/tui/src/lib.rs index bce9a350f4b..572735496de 100644 --- a/codex-rs/tui/src/lib.rs +++ b/codex-rs/tui/src/lib.rs @@ -45,6 +45,7 @@ mod clipboard_paste; mod color; pub mod custom_terminal; mod diff_render; +mod eval_capture_state; mod exec_cell; mod exec_command; mod external_editor;