Skip to content

Commit 5511a88

Browse files
committed
fix: consolidated UTF-8 safety improvements for string slicing
This PR consolidates the following UTF-8 safety fixes: - #31: Use safe UTF-8 slicing in import command base64 extraction - #32: Use safe UTF-8 slicing for session IDs in notifications - #33: Use char-aware string truncation for UTF-8 safety in resume - #35: Use safe UTF-8 slicing for session IDs in lock command - #37: Validate UTF-8 boundaries in mention parsing All changes ensure safe string operations that respect UTF-8 boundaries: - Replaced direct byte slicing with char-aware methods - Added floor_char_boundary checks before slicing - Prevents panics from slicing multi-byte characters
1 parent c398212 commit 5511a88

File tree

5 files changed

+294
-41
lines changed

5 files changed

+294
-41
lines changed

src/cortex-agents/src/mention.rs

Lines changed: 142 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,46 @@
1717
use regex::Regex;
1818
use std::sync::LazyLock;
1919

20+
/// Safely get the string slice up to the given byte position.
21+
///
22+
/// Returns the slice `&text[..pos]` if `pos` is at a valid UTF-8 character boundary.
23+
/// If `pos` is inside a multi-byte character, finds the nearest valid boundary
24+
/// by searching backwards.
25+
fn safe_slice_up_to(text: &str, pos: usize) -> &str {
26+
if pos >= text.len() {
27+
return text;
28+
}
29+
if text.is_char_boundary(pos) {
30+
return &text[..pos];
31+
}
32+
// Find the nearest valid boundary by searching backwards
33+
let mut valid_pos = pos;
34+
while valid_pos > 0 && !text.is_char_boundary(valid_pos) {
35+
valid_pos -= 1;
36+
}
37+
&text[..valid_pos]
38+
}
39+
40+
/// Safely get the string slice from the given byte position to the end.
41+
///
42+
/// Returns the slice `&text[pos..]` if `pos` is at a valid UTF-8 character boundary.
43+
/// If `pos` is inside a multi-byte character, finds the nearest valid boundary
44+
/// by searching forwards.
45+
fn safe_slice_from(text: &str, pos: usize) -> &str {
46+
if pos >= text.len() {
47+
return "";
48+
}
49+
if text.is_char_boundary(pos) {
50+
return &text[pos..];
51+
}
52+
// Find the nearest valid boundary by searching forwards
53+
let mut valid_pos = pos;
54+
while valid_pos < text.len() && !text.is_char_boundary(valid_pos) {
55+
valid_pos += 1;
56+
}
57+
&text[valid_pos..]
58+
}
59+
2060
/// A parsed agent mention from user input.
2161
#[derive(Debug, Clone, PartialEq, Eq)]
2262
pub struct AgentMention {
@@ -108,10 +148,10 @@ pub fn extract_mention_and_text(
108148
) -> Option<(AgentMention, String)> {
109149
let mention = find_first_valid_mention(text, valid_agents)?;
110150

111-
// Remove the mention from text
151+
// Remove the mention from text, using safe slicing for UTF-8 boundaries
112152
let mut remaining = String::with_capacity(text.len());
113-
remaining.push_str(&text[..mention.start]);
114-
remaining.push_str(&text[mention.end..]);
153+
remaining.push_str(safe_slice_up_to(text, mention.start));
154+
remaining.push_str(safe_slice_from(text, mention.end));
115155

116156
// Trim and normalize whitespace
117157
let remaining = remaining.trim().to_string();
@@ -123,7 +163,8 @@ pub fn extract_mention_and_text(
123163
pub fn starts_with_mention(text: &str, valid_agents: &[&str]) -> bool {
124164
let text = text.trim();
125165
if let Some(mention) = find_first_valid_mention(text, valid_agents) {
126-
mention.start == 0 || text[..mention.start].trim().is_empty()
166+
// Use safe slicing to handle UTF-8 boundaries
167+
mention.start == 0 || safe_slice_up_to(text, mention.start).trim().is_empty()
127168
} else {
128169
false
129170
}
@@ -196,8 +237,8 @@ pub fn parse_message_for_agent(text: &str, valid_agents: &[&str]) -> ParsedAgent
196237

197238
// Check if message starts with @agent
198239
if let Some((mention, remaining)) = extract_mention_and_text(text, valid_agents) {
199-
// Only trigger if mention is at the start
200-
if mention.start == 0 || text[..mention.start].trim().is_empty() {
240+
// Only trigger if mention is at the start, using safe slicing for UTF-8 boundaries
241+
if mention.start == 0 || safe_slice_up_to(text, mention.start).trim().is_empty() {
201242
return ParsedAgentMessage::for_agent(mention.agent_name, remaining, text.to_string());
202243
}
203244
}
@@ -318,4 +359,99 @@ mod tests {
318359
assert_eq!(mentions[0].agent_name, "my-agent");
319360
assert_eq!(mentions[1].agent_name, "my_agent");
320361
}
362+
363+
// UTF-8 boundary safety tests
364+
#[test]
365+
fn test_safe_slice_up_to_ascii() {
366+
let text = "hello world";
367+
assert_eq!(safe_slice_up_to(text, 5), "hello");
368+
assert_eq!(safe_slice_up_to(text, 0), "");
369+
assert_eq!(safe_slice_up_to(text, 100), "hello world");
370+
}
371+
372+
#[test]
373+
fn test_safe_slice_up_to_multibyte() {
374+
// "こんにちは" - each character is 3 bytes
375+
let text = "こんにちは";
376+
assert_eq!(safe_slice_up_to(text, 3), "こ"); // Valid boundary
377+
assert_eq!(safe_slice_up_to(text, 6), "こん"); // Valid boundary
378+
// Position 4 is inside the second character, should return "こ"
379+
assert_eq!(safe_slice_up_to(text, 4), "こ");
380+
assert_eq!(safe_slice_up_to(text, 5), "こ");
381+
}
382+
383+
#[test]
384+
fn test_safe_slice_from_multibyte() {
385+
let text = "こんにちは";
386+
assert_eq!(safe_slice_from(text, 3), "んにちは"); // Valid boundary
387+
// Position 4 is inside second character, should skip to position 6
388+
assert_eq!(safe_slice_from(text, 4), "にちは");
389+
assert_eq!(safe_slice_from(text, 5), "にちは");
390+
}
391+
392+
#[test]
393+
fn test_extract_mention_with_multibyte_prefix() {
394+
let valid = vec!["general"];
395+
396+
// Multi-byte characters before mention
397+
let result = extract_mention_and_text("日本語 @general search files", &valid);
398+
assert!(result.is_some());
399+
let (mention, remaining) = result.unwrap();
400+
assert_eq!(mention.agent_name, "general");
401+
// The prefix should be preserved without panicking
402+
assert!(remaining.contains("search files"));
403+
}
404+
405+
#[test]
406+
fn test_starts_with_mention_multibyte() {
407+
let valid = vec!["general"];
408+
409+
// Whitespace with multi-byte characters should not cause panic
410+
assert!(starts_with_mention(" @general task", &valid));
411+
412+
// Multi-byte characters before mention - should return false, not panic
413+
assert!(!starts_with_mention("日本語 @general task", &valid));
414+
}
415+
416+
#[test]
417+
fn test_parse_message_for_agent_multibyte() {
418+
let valid = vec!["general"];
419+
420+
// Multi-byte prefix - should not panic
421+
let parsed = parse_message_for_agent("日本語 @general find files", &valid);
422+
// Since mention is not at the start, should not invoke task
423+
assert!(!parsed.should_invoke_task);
424+
425+
// Multi-byte in the prompt (after mention)
426+
let parsed = parse_message_for_agent("@general 日本語を検索", &valid);
427+
assert!(parsed.should_invoke_task);
428+
assert_eq!(parsed.agent, Some("general".to_string()));
429+
assert_eq!(parsed.prompt, "日本語を検索");
430+
}
431+
432+
#[test]
433+
fn test_extract_mention_with_emoji() {
434+
let valid = vec!["general"];
435+
436+
// Emojis are 4 bytes each
437+
let result = extract_mention_and_text("🎉 @general celebrate", &valid);
438+
assert!(result.is_some());
439+
let (mention, remaining) = result.unwrap();
440+
assert_eq!(mention.agent_name, "general");
441+
assert!(remaining.contains("celebrate"));
442+
}
443+
444+
#[test]
445+
fn test_mixed_multibyte_and_ascii() {
446+
let valid = vec!["general"];
447+
448+
// Mix of ASCII, CJK, and emoji
449+
let text = "Hello 世界 🌍 @general search for 日本語";
450+
let result = extract_mention_and_text(text, &valid);
451+
assert!(result.is_some());
452+
let (mention, remaining) = result.unwrap();
453+
assert_eq!(mention.agent_name, "general");
454+
// Should not panic and produce valid output
455+
assert!(!remaining.is_empty());
456+
}
321457
}

src/cortex-cli/src/import_cmd.rs

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -357,31 +357,47 @@ fn validate_export_messages(messages: &[ExportMessage]) -> Result<()> {
357357
for (idx, message) in messages.iter().enumerate() {
358358
// Check for base64-encoded image data in content
359359
// Common pattern: "data:image/png;base64,..." or "data:image/jpeg;base64,..."
360-
if let Some(data_uri_start) = message.content.find("data:image/")
361-
&& let Some(base64_marker) = message.content[data_uri_start..].find(";base64,")
362-
{
363-
let base64_start = data_uri_start + base64_marker + 8; // 8 = len(";base64,")
364-
let remaining = &message.content[base64_start..];
365-
366-
// Find end of base64 data (could end with quote, whitespace, or end of string)
367-
let base64_end = remaining
368-
.find(['"', '\'', ' ', '\n', ')'])
369-
.unwrap_or(remaining.len());
370-
let base64_data = &remaining[..base64_end];
371-
372-
// Validate the base64 data
373-
if !base64_data.is_empty() {
374-
let engine = base64::engine::general_purpose::STANDARD;
375-
if let Err(e) = engine.decode(base64_data) {
376-
bail!(
377-
"Invalid base64 encoding in message {} (role: '{}'): {}\n\
378-
The image data starting at position {} has invalid base64 encoding.\n\
379-
Please ensure all embedded images use valid base64 encoding.",
380-
idx + 1,
381-
message.role,
382-
e,
383-
data_uri_start
384-
);
360+
if let Some(data_uri_start) = message.content.find("data:image/") {
361+
// Use safe slicing with .get() to avoid panics on multi-byte UTF-8 boundaries
362+
let content_after_start = match message.content.get(data_uri_start..) {
363+
Some(s) => s,
364+
None => continue, // Invalid byte offset, skip this message
365+
};
366+
367+
if let Some(base64_marker) = content_after_start.find(";base64,") {
368+
let base64_start = data_uri_start + base64_marker + 8; // 8 = len(";base64,")
369+
370+
// Safe slicing for the remaining content after base64 marker
371+
let remaining = match message.content.get(base64_start..) {
372+
Some(s) => s,
373+
None => continue, // Invalid byte offset, skip this message
374+
};
375+
376+
// Find end of base64 data (could end with quote, whitespace, or end of string)
377+
let base64_end = remaining
378+
.find(['"', '\'', ' ', '\n', ')'])
379+
.unwrap_or(remaining.len());
380+
381+
// Safe slicing for the base64 data
382+
let base64_data = match remaining.get(..base64_end) {
383+
Some(s) => s,
384+
None => continue, // Invalid byte offset, skip this message
385+
};
386+
387+
// Validate the base64 data
388+
if !base64_data.is_empty() {
389+
let engine = base64::engine::general_purpose::STANDARD;
390+
if let Err(e) = engine.decode(base64_data) {
391+
bail!(
392+
"Invalid base64 encoding in message {} (role: '{}'): {}\n\
393+
The image data starting at position {} has invalid base64 encoding.\n\
394+
Please ensure all embedded images use valid base64 encoding.",
395+
idx + 1,
396+
message.role,
397+
e,
398+
data_uri_start
399+
);
400+
}
385401
}
386402
}
387403
}
@@ -395,13 +411,24 @@ fn validate_export_messages(messages: &[ExportMessage]) -> Result<()> {
395411
// Try to find and validate any base64 in the arguments
396412
for (pos, _) in args_str.match_indices(";base64,") {
397413
let base64_start = pos + 8;
398-
let remaining = &args_str[base64_start..];
414+
415+
// Safe slicing for the remaining content after base64 marker
416+
let remaining = match args_str.get(base64_start..) {
417+
Some(s) => s,
418+
None => continue, // Invalid byte offset, skip this occurrence
419+
};
420+
399421
let base64_end = remaining
400422
.find(|c: char| {
401423
c == '"' || c == '\'' || c == ' ' || c == '\n' || c == ')'
402424
})
403425
.unwrap_or(remaining.len());
404-
let base64_data = &remaining[..base64_end];
426+
427+
// Safe slicing for the base64 data
428+
let base64_data = match remaining.get(..base64_end) {
429+
Some(s) => s,
430+
None => continue, // Invalid byte offset, skip this occurrence
431+
};
405432

406433
if !base64_data.is_empty() {
407434
let engine = base64::engine::general_purpose::STANDARD;

src/cortex-cli/src/lock_cmd.rs

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,15 @@ fn validate_session_id(session_id: &str) -> Result<()> {
114114
)
115115
}
116116

117+
/// Safely get a string prefix by character count, not byte count.
118+
/// This avoids panics on multi-byte UTF-8 characters.
119+
fn safe_char_prefix(s: &str, max_chars: usize) -> &str {
120+
match s.char_indices().nth(max_chars) {
121+
Some((byte_idx, _)) => &s[..byte_idx],
122+
None => s, // String has fewer than max_chars characters
123+
}
124+
}
125+
117126
/// Get the lock file path.
118127
fn get_lock_file_path() -> PathBuf {
119128
dirs::home_dir()
@@ -156,7 +165,7 @@ pub fn is_session_locked(session_id: &str) -> bool {
156165
match load_lock_file() {
157166
Ok(lock_file) => lock_file.locked_sessions.iter().any(|entry| {
158167
entry.session_id == session_id
159-
|| session_id.starts_with(&entry.session_id[..8.min(entry.session_id.len())])
168+
|| session_id.starts_with(safe_char_prefix(&entry.session_id, 8))
160169
}),
161170
Err(_) => false,
162171
}
@@ -308,7 +317,7 @@ async fn run_list(args: LockListArgs) -> Result<()> {
308317
println!("{}", "-".repeat(60));
309318

310319
for entry in &lock_file.locked_sessions {
311-
let short_id = &entry.session_id[..8.min(entry.session_id.len())];
320+
let short_id = safe_char_prefix(&entry.session_id, 8);
312321
println!(" {} - locked at {}", short_id, entry.locked_at);
313322
if let Some(ref reason) = entry.reason {
314323
println!(" Reason: {}", reason);
@@ -332,7 +341,7 @@ async fn run_check(args: LockCheckArgs) -> Result<()> {
332341
e.session_id == args.session_id
333342
|| args
334343
.session_id
335-
.starts_with(&e.session_id[..8.min(e.session_id.len())])
344+
.starts_with(safe_char_prefix(&e.session_id, 8))
336345
});
337346

338347
if is_locked {
@@ -342,7 +351,7 @@ async fn run_check(args: LockCheckArgs) -> Result<()> {
342351
e.session_id == args.session_id
343352
|| args
344353
.session_id
345-
.starts_with(&e.session_id[..8.min(e.session_id.len())])
354+
.starts_with(safe_char_prefix(&e.session_id, 8))
346355
}) && let Some(ref reason) = entry.reason
347356
{
348357
println!("Reason: {}", reason);
@@ -508,4 +517,39 @@ mod tests {
508517
let path_str = path.to_string_lossy();
509518
assert!(path_str.contains(".cortex"));
510519
}
520+
521+
#[test]
522+
fn test_safe_char_prefix_ascii() {
523+
// ASCII strings should work correctly
524+
assert_eq!(safe_char_prefix("abcdefghij", 8), "abcdefgh");
525+
assert_eq!(safe_char_prefix("abc", 8), "abc");
526+
assert_eq!(safe_char_prefix("", 8), "");
527+
assert_eq!(safe_char_prefix("12345678", 8), "12345678");
528+
}
529+
530+
#[test]
531+
fn test_safe_char_prefix_utf8_multibyte() {
532+
// Multi-byte UTF-8 characters should not panic
533+
// Each emoji is 4 bytes, so 8 chars = 32 bytes
534+
let emoji_id = "🔥🎉🚀💡🌟✨🎯🔮extra";
535+
assert_eq!(safe_char_prefix(emoji_id, 8), "🔥🎉🚀💡🌟✨🎯🔮");
536+
537+
// Mixed ASCII and multi-byte
538+
let mixed = "ab🔥cd🎉ef";
539+
assert_eq!(safe_char_prefix(mixed, 4), "ab🔥c");
540+
assert_eq!(safe_char_prefix(mixed, 8), "ab🔥cd🎉ef");
541+
542+
// Chinese characters (3 bytes each)
543+
let chinese = "中文测试会话标识符";
544+
assert_eq!(safe_char_prefix(chinese, 4), "中文测试");
545+
}
546+
547+
#[test]
548+
fn test_safe_char_prefix_boundary() {
549+
// Edge cases
550+
assert_eq!(safe_char_prefix("a", 0), "");
551+
assert_eq!(safe_char_prefix("a", 1), "a");
552+
assert_eq!(safe_char_prefix("🔥", 1), "🔥");
553+
assert_eq!(safe_char_prefix("🔥", 0), "");
554+
}
511555
}

src/cortex-cli/src/utils/notification.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,14 @@ pub fn send_task_notification(session_id: &str, success: bool) -> Result<()> {
6363
"Cortex Task Failed"
6464
};
6565

66-
let short_id = &session_id[..8.min(session_id.len())];
66+
// Use safe UTF-8 slicing - find the last valid char boundary at or before position 8
67+
let short_id = session_id
68+
.char_indices()
69+
.take_while(|(idx, _)| *idx < 8)
70+
.map(|(idx, ch)| idx + ch.len_utf8())
71+
.last()
72+
.and_then(|end| session_id.get(..end))
73+
.unwrap_or(session_id);
6774
let body = format!("Session: {}", short_id);
6875

6976
let urgency = if success {

0 commit comments

Comments
 (0)