Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions codex-rs/core/src/codex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,24 @@ impl Session {
}
}

/// Returns the input if there was no task running to inject into
pub async fn inject_response_items(
&self,
input: Vec<ResponseInputItem>,
) -> Result<(), Vec<ResponseInputItem>> {
let mut active = self.active_turn.lock().await;
match active.as_mut() {
Some(at) => {
let mut ts = at.turn_state.lock().await;
for item in input {
ts.push_pending_input(item);
}
Ok(())
}
None => Err(input),
}
}

pub async fn get_pending_input(&self) -> Vec<ResponseInputItem> {
let mut active = self.active_turn.lock().await;
match active.as_mut() {
Expand Down
49 changes: 48 additions & 1 deletion codex-rs/core/src/event_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ use codex_protocol::models::ReasoningItemContent;
use codex_protocol::models::ReasoningItemReasoningSummary;
use codex_protocol::models::ResponseItem;
use codex_protocol::models::WebSearchAction;
use codex_protocol::models::is_local_image_close_tag_text;
use codex_protocol::models::is_local_image_open_tag_text;
use codex_protocol::user_input::UserInput;
use tracing::warn;
use uuid::Uuid;
Expand All @@ -32,9 +34,17 @@ fn parse_user_message(message: &[ContentItem]) -> Option<UserMessageItem> {

let mut content: Vec<UserInput> = Vec::new();

for content_item in message.iter() {
for (idx, content_item) in message.iter().enumerate() {
match content_item {
ContentItem::InputText { text } => {
if is_local_image_open_tag_text(text)
&& (matches!(message.get(idx + 1), Some(ContentItem::InputImage { .. })))
|| (idx > 0
&& is_local_image_close_tag_text(text)
&& matches!(message.get(idx - 1), Some(ContentItem::InputImage { .. })))
{
continue;
}
if is_session_prefix(text) || is_user_shell_command_text(text) {
return None;
}
Expand Down Expand Up @@ -177,6 +187,43 @@ mod tests {
}
}

#[test]
fn skips_local_image_label_text() {
let image_url = "".to_string();
let label = codex_protocol::models::local_image_open_tag_text(1);
let user_text = "Please review this image.".to_string();

let item = ResponseItem::Message {
id: None,
role: "user".to_string(),
content: vec![
ContentItem::InputText { text: label },
ContentItem::InputImage {
image_url: image_url.clone(),
},
ContentItem::InputText {
text: "</image>".to_string(),
},
ContentItem::InputText {
text: user_text.clone(),
},
],
};

let turn_item = parse_turn_item(&item).expect("expected user message turn item");

match turn_item {
TurnItem::UserMessage(user) => {
let expected_content = vec![
UserInput::Image { image_url },
UserInput::Text { text: user_text },
];
assert_eq!(user.content, expected_content);
}
other => panic!("expected TurnItem::UserMessage, got {other:?}"),
}
}

#[test]
fn skips_user_instructions_and_env() {
let items = vec![
Expand Down
12 changes: 10 additions & 2 deletions codex-rs/core/src/tools/handlers/view_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ use crate::tools::context::ToolPayload;
use crate::tools::handlers::parse_arguments;
use crate::tools::registry::ToolHandler;
use crate::tools::registry::ToolKind;
use codex_protocol::user_input::UserInput;
use codex_protocol::models::ContentItem;
use codex_protocol::models::ResponseInputItem;
use codex_protocol::models::local_image_content_items;

pub struct ViewImageHandler;

Expand Down Expand Up @@ -63,8 +65,14 @@ impl ToolHandler for ViewImageHandler {
}
let event_path = abs_path.clone();

let content: Vec<ContentItem> = local_image_content_items(&abs_path, false);
let input = ResponseInputItem::Message {
role: "user".to_string(),
content,
};

session
.inject_input(vec![UserInput::LocalImage { path: abs_path }])
.inject_response_items(vec![input])
.await
.map_err(|_| {
FunctionCallError::RespondToModel(
Expand Down
8 changes: 4 additions & 4 deletions codex-rs/core/src/tools/spec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::tools::handlers::PLAN_TOOL;
use crate::tools::handlers::apply_patch::create_apply_patch_freeform_tool;
use crate::tools::handlers::apply_patch::create_apply_patch_json_tool;
use crate::tools::registry::ToolRegistryBuilder;
use codex_protocol::models::VIEW_IMAGE_TOOL_NAME;
use codex_protocol::openai_models::ApplyPatchToolType;
use codex_protocol::openai_models::ConfigShellToolType;
use codex_protocol::openai_models::ModelInfo;
Expand Down Expand Up @@ -403,10 +404,9 @@ fn create_view_image_tool() -> ToolSpec {
);

ToolSpec::Function(ResponsesApiTool {
name: "view_image".to_string(),
description:
"Attach a local image (by filesystem path) to the thread context for this turn."
.to_string(),
name: VIEW_IMAGE_TOOL_NAME.to_string(),
description: "View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within <image ...> tags)."
.to_string(),
strict: false,
parameters: JsonSchema::Object {
properties,
Expand Down
116 changes: 85 additions & 31 deletions codex-rs/protocol/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,30 @@ fn local_image_error_placeholder(
}
}

pub const VIEW_IMAGE_TOOL_NAME: &str = "view_image";

const LOCAL_IMAGE_OPEN_TAG_PREFIX: &str = "<image name=";
const LOCAL_IMAGE_OPEN_TAG_SUFFIX: &str = ">";
const LOCAL_IMAGE_CLOSE_TAG: &str = "</image>";

pub fn local_image_label_text(label_number: usize) -> String {
format!("[Image #{label_number}]")
}

pub fn local_image_open_tag_text(label_number: usize) -> String {
let label = local_image_label_text(label_number);
format!("{LOCAL_IMAGE_OPEN_TAG_PREFIX}{label}{LOCAL_IMAGE_OPEN_TAG_SUFFIX}")
}

pub fn is_local_image_open_tag_text(text: &str) -> bool {
text.strip_prefix(LOCAL_IMAGE_OPEN_TAG_PREFIX)
.is_some_and(|rest| rest.ends_with(LOCAL_IMAGE_OPEN_TAG_SUFFIX))
}

pub fn is_local_image_close_tag_text(text: &str) -> bool {
text == LOCAL_IMAGE_CLOSE_TAG
}

fn invalid_image_error_placeholder(
path: &std::path::Path,
error: impl std::fmt::Display,
Expand All @@ -203,6 +227,58 @@ fn unsupported_image_error_placeholder(path: &std::path::Path, mime: &str) -> Co
}
}

pub fn local_image_content_items(path: &std::path::Path, include_label: bool) -> Vec<ContentItem> {
local_image_content_items_with_label_number(path, include_label, 1)
}

fn local_image_content_items_with_label_number(
path: &std::path::Path,
include_label: bool,
label_number: usize,
) -> Vec<ContentItem> {
match load_and_resize_to_fit(path) {
Ok(image) => {
let mut items = Vec::with_capacity(3);
if include_label {
items.push(ContentItem::InputText {
text: local_image_open_tag_text(label_number),
});
}
items.push(ContentItem::InputImage {
image_url: image.into_data_url(),
});
if include_label {
items.push(ContentItem::InputText {
text: LOCAL_IMAGE_CLOSE_TAG.to_string(),
});
}
items
}
Err(err) => {
if matches!(&err, ImageProcessingError::Read { .. }) {
vec![local_image_error_placeholder(path, &err)]
} else if err.is_invalid_image() {
vec![invalid_image_error_placeholder(path, &err)]
} else {
let Some(mime_guess) = mime_guess::from_path(path).first() else {
return vec![local_image_error_placeholder(
path,
"unsupported MIME type (unknown)",
)];
};
let mime = mime_guess.essence_str().to_owned();
if !mime.starts_with("image/") {
return vec![local_image_error_placeholder(
path,
format!("unsupported MIME type `{mime}`"),
)];
}
vec![unsupported_image_error_placeholder(path, &mime)]
}
}
}
}

impl From<ResponseInputItem> for ResponseItem {
fn from(item: ResponseInputItem) -> Self {
match item {
Expand Down Expand Up @@ -296,41 +372,19 @@ pub enum ReasoningItemContent {

impl From<Vec<UserInput>> for ResponseInputItem {
fn from(items: Vec<UserInput>) -> Self {
let mut image_index = 0;
Self::Message {
role: "user".to_string(),
content: items
.into_iter()
.filter_map(|c| match c {
UserInput::Text { text } => Some(ContentItem::InputText { text }),
UserInput::Image { image_url } => Some(ContentItem::InputImage { image_url }),
UserInput::LocalImage { path } => match load_and_resize_to_fit(&path) {
Ok(image) => Some(ContentItem::InputImage {
image_url: image.into_data_url(),
}),
Err(err) => {
if matches!(&err, ImageProcessingError::Read { .. }) {
Some(local_image_error_placeholder(&path, &err))
} else if err.is_invalid_image() {
Some(invalid_image_error_placeholder(&path, &err))
} else {
let Some(mime_guess) = mime_guess::from_path(&path).first() else {
return Some(local_image_error_placeholder(
&path,
"unsupported MIME type (unknown)",
));
};
let mime = mime_guess.essence_str().to_owned();
if !mime.starts_with("image/") {
return Some(local_image_error_placeholder(
&path,
format!("unsupported MIME type `{mime}`"),
));
}
Some(unsupported_image_error_placeholder(&path, &mime))
}
}
},
UserInput::Skill { .. } => None, // Skill bodies are injected later in core
.flat_map(|c| match c {
UserInput::Text { text } => vec![ContentItem::InputText { text }],
UserInput::Image { image_url } => vec![ContentItem::InputImage { image_url }],
UserInput::LocalImage { path } => {
image_index += 1;
local_image_content_items_with_label_number(&path, true, image_index)
}
UserInput::Skill { .. } => Vec::new(), // Skill bodies are injected later in core
})
.collect::<Vec<ContentItem>>(),
}
Expand Down
Loading
Loading