1 Commits

Author SHA1 Message Date
Yeachan-Heo
5b046836b9 Enable local image prompts without breaking text-only CLI flows
The Rust CLI now recognizes explicit local image references in prompt text,
encodes supported image files as base64, and serializes mixed text/image
content blocks for the API. The request conversion path was kept narrow so
existing runtime/session structures remain stable while prompt mode and user
text conversion gain multimodal support.

Constraint: Must support PNG, JPG/JPEG, GIF, and WebP without adding broad runtime abstractions
Constraint: Existing text-only prompt behavior and API tool flows must keep working unchanged
Rejected: Add only explicit --image CLI flags | does not satisfy auto-detect image refs in prompt text
Rejected: Persist native image blocks in runtime session model | broader refactor than needed for prompt support
Confidence: high
Scope-risk: moderate
Reversibility: clean
Directive: Keep image parsing scoped to outbound user prompt adaptation unless session persistence truly needs multimodal history
Tested: cargo fmt --all; cargo clippy --workspace --all-targets -- -D warnings; cargo test --workspace
Not-tested: Live remote multimodal request against Anthropic API
2026-04-01 00:59:16 +00:00
5 changed files with 377 additions and 42 deletions

View File

@@ -11,7 +11,7 @@ pub use error::ApiError;
pub use sse::{parse_frame, SseParser}; pub use sse::{parse_frame, SseParser};
pub use types::{ pub use types::{
ContentBlockDelta, ContentBlockDeltaEvent, ContentBlockStartEvent, ContentBlockStopEvent, ContentBlockDelta, ContentBlockDeltaEvent, ContentBlockStartEvent, ContentBlockStopEvent,
InputContentBlock, InputMessage, MessageDelta, MessageDeltaEvent, MessageRequest, ImageSource, InputContentBlock, InputMessage, MessageDelta, MessageDeltaEvent, MessageRequest,
MessageResponse, MessageStartEvent, MessageStopEvent, OutputContentBlock, StreamEvent, MessageResponse, MessageStartEvent, MessageStopEvent, OutputContentBlock, StreamEvent,
ToolChoice, ToolDefinition, ToolResultContentBlock, Usage, ToolChoice, ToolDefinition, ToolResultContentBlock, Usage,
}; };

View File

@@ -64,6 +64,9 @@ pub enum InputContentBlock {
Text { Text {
text: String, text: String,
}, },
Image {
source: ImageSource,
},
ToolUse { ToolUse {
id: String, id: String,
name: String, name: String,
@@ -77,6 +80,14 @@ pub enum InputContentBlock {
}, },
} }
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ImageSource {
#[serde(rename = "type")]
pub kind: String,
pub media_type: String,
pub data: String,
}
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")] #[serde(tag = "type", rename_all = "snake_case")]
pub enum ToolResultContentBlock { pub enum ToolResultContentBlock {

View File

@@ -4,8 +4,8 @@ use std::time::Duration;
use api::{ use api::{
AnthropicClient, ApiError, ContentBlockDelta, ContentBlockDeltaEvent, ContentBlockStartEvent, AnthropicClient, ApiError, ContentBlockDelta, ContentBlockDeltaEvent, ContentBlockStartEvent,
InputContentBlock, InputMessage, MessageDeltaEvent, MessageRequest, OutputContentBlock, ImageSource, InputContentBlock, InputMessage, MessageDeltaEvent, MessageRequest,
StreamEvent, ToolChoice, ToolDefinition, OutputContentBlock, StreamEvent, ToolChoice, ToolDefinition,
}; };
use serde_json::json; use serde_json::json;
use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::io::{AsyncReadExt, AsyncWriteExt};
@@ -75,6 +75,39 @@ async fn send_message_posts_json_and_parses_response() {
assert_eq!(body["tool_choice"]["type"], json!("auto")); assert_eq!(body["tool_choice"]["type"], json!("auto"));
} }
#[test]
fn image_content_blocks_serialize_with_base64_source() {
let request = MessageRequest {
model: "claude-3-7-sonnet-latest".to_string(),
max_tokens: 64,
messages: vec![InputMessage {
role: "user".to_string(),
content: vec![InputContentBlock::Image {
source: ImageSource {
kind: "base64".to_string(),
media_type: "image/png".to_string(),
data: "AQID".to_string(),
},
}],
}],
system: None,
tools: None,
tool_choice: None,
stream: false,
};
let json = serde_json::to_value(request).expect("request should serialize");
assert_eq!(json["messages"][0]["content"][0]["type"], json!("image"));
assert_eq!(
json["messages"][0]["content"][0]["source"],
json!({
"type": "base64",
"media_type": "image/png",
"data": "AQID"
})
);
}
#[tokio::test] #[tokio::test]
async fn stream_message_parses_sse_events_with_tool_use() { async fn stream_message_parses_sse_events_with_tool_use() {
let state = Arc::new(Mutex::new(Vec::<CapturedRequest>::new())); let state = Arc::new(Mutex::new(Vec::<CapturedRequest>::new()));

View File

@@ -408,7 +408,7 @@ mod tests {
.sum::<i32>(); .sum::<i32>();
Ok(total.to_string()) Ok(total.to_string())
}); });
let permission_policy = PermissionPolicy::new(PermissionMode::Prompt); let permission_policy = PermissionPolicy::new(PermissionMode::WorkspaceWrite);
let system_prompt = SystemPromptBuilder::new() let system_prompt = SystemPromptBuilder::new()
.with_project_context(ProjectContext { .with_project_context(ProjectContext {
cwd: PathBuf::from("/tmp/project"), cwd: PathBuf::from("/tmp/project"),
@@ -487,7 +487,7 @@ mod tests {
Session::new(), Session::new(),
SingleCallApiClient, SingleCallApiClient,
StaticToolExecutor::new(), StaticToolExecutor::new(),
PermissionPolicy::new(PermissionMode::Prompt), PermissionPolicy::new(PermissionMode::WorkspaceWrite),
vec!["system".to_string()], vec!["system".to_string()],
); );
@@ -536,7 +536,7 @@ mod tests {
session, session,
SimpleApi, SimpleApi,
StaticToolExecutor::new(), StaticToolExecutor::new(),
PermissionPolicy::new(PermissionMode::Allow), PermissionPolicy::new(PermissionMode::DangerFullAccess),
vec!["system".to_string()], vec!["system".to_string()],
); );
@@ -563,7 +563,7 @@ mod tests {
Session::new(), Session::new(),
SimpleApi, SimpleApi,
StaticToolExecutor::new(), StaticToolExecutor::new(),
PermissionPolicy::new(PermissionMode::Allow), PermissionPolicy::new(PermissionMode::DangerFullAccess),
vec!["system".to_string()], vec!["system".to_string()],
); );
runtime.run_turn("a", None).expect("turn a"); runtime.run_turn("a", None).expect("turn a");

View File

@@ -11,8 +11,8 @@ use std::process::Command;
use std::time::{SystemTime, UNIX_EPOCH}; use std::time::{SystemTime, UNIX_EPOCH};
use api::{ use api::{
resolve_startup_auth_source, AnthropicClient, AuthSource, ContentBlockDelta, InputContentBlock, resolve_startup_auth_source, AnthropicClient, AuthSource, ContentBlockDelta, ImageSource,
InputMessage, MessageRequest, MessageResponse, OutputContentBlock, InputContentBlock, InputMessage, MessageRequest, MessageResponse, OutputContentBlock,
StreamEvent as ApiStreamEvent, ToolChoice, ToolDefinition, ToolResultContentBlock, StreamEvent as ApiStreamEvent, ToolChoice, ToolDefinition, ToolResultContentBlock,
}; };
@@ -41,6 +41,7 @@ const BUILD_TARGET: Option<&str> = option_env!("TARGET");
const GIT_SHA: Option<&str> = option_env!("GIT_SHA"); const GIT_SHA: Option<&str> = option_env!("GIT_SHA");
type AllowedToolSet = BTreeSet<String>; type AllowedToolSet = BTreeSet<String>;
const IMAGE_REF_PREFIX: &str = "@";
fn main() { fn main() {
if let Err(error) = run() { if let Err(error) = run() {
@@ -1042,9 +1043,7 @@ impl LiveCli {
max_tokens: DEFAULT_MAX_TOKENS, max_tokens: DEFAULT_MAX_TOKENS,
messages: vec![InputMessage { messages: vec![InputMessage {
role: "user".to_string(), role: "user".to_string(),
content: vec![InputContentBlock::Text { content: prompt_to_content_blocks(input, &env::current_dir()?)?,
text: input.to_string(),
}],
}], }],
system: (!self.system_prompt.is_empty()).then(|| self.system_prompt.join("\n\n")), system: (!self.system_prompt.is_empty()).then(|| self.system_prompt.join("\n\n")),
tools: None, tools: None,
@@ -2021,7 +2020,7 @@ impl ApiClient for AnthropicRuntimeClient {
let message_request = MessageRequest { let message_request = MessageRequest {
model: self.model.clone(), model: self.model.clone(),
max_tokens: DEFAULT_MAX_TOKENS, max_tokens: DEFAULT_MAX_TOKENS,
messages: convert_messages(&request.messages), messages: convert_messages(&request.messages)?,
system: (!request.system_prompt.is_empty()).then(|| request.system_prompt.join("\n\n")), system: (!request.system_prompt.is_empty()).then(|| request.system_prompt.join("\n\n")),
tools: self.enable_tools.then(|| { tools: self.enable_tools.then(|| {
filter_tool_specs(self.allowed_tools.as_ref()) filter_tool_specs(self.allowed_tools.as_ref())
@@ -2300,7 +2299,10 @@ fn tool_permission_specs() -> Vec<ToolSpec> {
mvp_tool_specs() mvp_tool_specs()
} }
fn convert_messages(messages: &[ConversationMessage]) -> Vec<InputMessage> { fn convert_messages(messages: &[ConversationMessage]) -> Result<Vec<InputMessage>, RuntimeError> {
let cwd = env::current_dir().map_err(|error| {
RuntimeError::new(format!("failed to resolve current directory: {error}"))
})?;
messages messages
.iter() .iter()
.filter_map(|message| { .filter_map(|message| {
@@ -2311,36 +2313,224 @@ fn convert_messages(messages: &[ConversationMessage]) -> Vec<InputMessage> {
let content = message let content = message
.blocks .blocks
.iter() .iter()
.map(|block| match block { .try_fold(Vec::new(), |mut acc, block| {
ContentBlock::Text { text } => InputContentBlock::Text { text: text.clone() }, match block {
ContentBlock::ToolUse { id, name, input } => InputContentBlock::ToolUse { ContentBlock::Text { text } => {
id: id.clone(), if message.role == MessageRole::User {
name: name.clone(), acc.extend(
input: serde_json::from_str(input) prompt_to_content_blocks(text, &cwd)
.unwrap_or_else(|_| serde_json::json!({ "raw": input })), .map_err(RuntimeError::new)?,
}, );
ContentBlock::ToolResult { } else {
tool_use_id, acc.push(InputContentBlock::Text { text: text.clone() });
output, }
is_error, }
.. ContentBlock::ToolUse { id, name, input } => {
} => InputContentBlock::ToolResult { acc.push(InputContentBlock::ToolUse {
tool_use_id: tool_use_id.clone(), id: id.clone(),
content: vec![ToolResultContentBlock::Text { name: name.clone(),
text: output.clone(), input: serde_json::from_str(input)
}], .unwrap_or_else(|_| serde_json::json!({ "raw": input })),
is_error: *is_error, });
}, }
}) ContentBlock::ToolResult {
.collect::<Vec<_>>(); tool_use_id,
(!content.is_empty()).then(|| InputMessage { output,
role: role.to_string(), is_error,
content, ..
}) } => acc.push(InputContentBlock::ToolResult {
tool_use_id: tool_use_id.clone(),
content: vec![ToolResultContentBlock::Text {
text: output.clone(),
}],
is_error: *is_error,
}),
}
Ok::<_, RuntimeError>(acc)
});
match content {
Ok(content) if !content.is_empty() => Some(Ok(InputMessage {
role: role.to_string(),
content,
})),
Ok(_) => None,
Err(error) => Some(Err(error)),
}
}) })
.collect() .collect()
} }
fn prompt_to_content_blocks(input: &str, cwd: &Path) -> Result<Vec<InputContentBlock>, String> {
let mut blocks = Vec::new();
let mut text_buffer = String::new();
let mut chars = input.char_indices().peekable();
while let Some((index, ch)) = chars.next() {
if ch == '!' && input[index..].starts_with("![") {
if let Some((alt_end, path_start, path_end)) = parse_markdown_image_ref(input, index) {
let _ = alt_end;
flush_text_block(&mut blocks, &mut text_buffer);
let path = &input[path_start..path_end];
blocks.push(load_image_block(path, cwd)?);
while let Some((next_index, _)) = chars.peek() {
if *next_index < path_end + 1 {
let _ = chars.next();
} else {
break;
}
}
continue;
}
}
if ch == '@' && is_ref_boundary(input[..index].chars().next_back()) {
let path_end = find_path_end(input, index + 1);
if path_end > index + 1 {
let candidate = &input[index + 1..path_end];
if looks_like_image_ref(candidate, cwd) {
flush_text_block(&mut blocks, &mut text_buffer);
blocks.push(load_image_block(candidate, cwd)?);
while let Some((next_index, _)) = chars.peek() {
if *next_index < path_end {
let _ = chars.next();
} else {
break;
}
}
continue;
}
}
}
text_buffer.push(ch);
}
flush_text_block(&mut blocks, &mut text_buffer);
if blocks.is_empty() {
blocks.push(InputContentBlock::Text {
text: input.to_string(),
});
}
Ok(blocks)
}
fn parse_markdown_image_ref(input: &str, start: usize) -> Option<(usize, usize, usize)> {
let after_bang = input.get(start + 2..)?;
let alt_end_offset = after_bang.find("](")?;
let path_start = start + 2 + alt_end_offset + 2;
let remainder = input.get(path_start..)?;
let path_end_offset = remainder.find(')')?;
let path_end = path_start + path_end_offset;
Some((start + 2 + alt_end_offset, path_start, path_end))
}
fn is_ref_boundary(ch: Option<char>) -> bool {
ch.is_none_or(char::is_whitespace)
}
fn find_path_end(input: &str, start: usize) -> usize {
input[start..]
.char_indices()
.find_map(|(offset, ch)| (ch.is_whitespace()).then_some(start + offset))
.unwrap_or(input.len())
}
fn looks_like_image_ref(candidate: &str, cwd: &Path) -> bool {
let resolved = resolve_prompt_path(candidate, cwd);
media_type_for_path(Path::new(candidate)).is_some()
|| resolved.is_file()
|| candidate.contains(std::path::MAIN_SEPARATOR)
|| candidate.starts_with("./")
|| candidate.starts_with("../")
}
fn flush_text_block(blocks: &mut Vec<InputContentBlock>, text_buffer: &mut String) {
if text_buffer.is_empty() {
return;
}
blocks.push(InputContentBlock::Text {
text: std::mem::take(text_buffer),
});
}
fn load_image_block(path_ref: &str, cwd: &Path) -> Result<InputContentBlock, String> {
let resolved = resolve_prompt_path(path_ref, cwd);
let media_type = media_type_for_path(&resolved).ok_or_else(|| {
format!(
"unsupported image format for reference {IMAGE_REF_PREFIX}{path_ref}; supported: png, jpg, jpeg, gif, webp"
)
})?;
let bytes = fs::read(&resolved).map_err(|error| {
format!(
"failed to read image reference {}: {error}",
resolved.display()
)
})?;
Ok(InputContentBlock::Image {
source: ImageSource {
kind: "base64".to_string(),
media_type: media_type.to_string(),
data: encode_base64(&bytes),
},
})
}
fn resolve_prompt_path(path_ref: &str, cwd: &Path) -> PathBuf {
let path = Path::new(path_ref);
if path.is_absolute() {
path.to_path_buf()
} else {
cwd.join(path)
}
}
fn media_type_for_path(path: &Path) -> Option<&'static str> {
let extension = path.extension()?.to_str()?.to_ascii_lowercase();
match extension.as_str() {
"png" => Some("image/png"),
"jpg" | "jpeg" => Some("image/jpeg"),
"gif" => Some("image/gif"),
"webp" => Some("image/webp"),
_ => None,
}
}
fn encode_base64(bytes: &[u8]) -> String {
const TABLE: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
let mut output = String::new();
let mut index = 0;
while index + 3 <= bytes.len() {
let block = (u32::from(bytes[index]) << 16)
| (u32::from(bytes[index + 1]) << 8)
| u32::from(bytes[index + 2]);
output.push(TABLE[((block >> 18) & 0x3F) as usize] as char);
output.push(TABLE[((block >> 12) & 0x3F) as usize] as char);
output.push(TABLE[((block >> 6) & 0x3F) as usize] as char);
output.push(TABLE[(block & 0x3F) as usize] as char);
index += 3;
}
match bytes.len().saturating_sub(index) {
1 => {
let block = u32::from(bytes[index]) << 16;
output.push(TABLE[((block >> 18) & 0x3F) as usize] as char);
output.push(TABLE[((block >> 12) & 0x3F) as usize] as char);
output.push('=');
output.push('=');
}
2 => {
let block = (u32::from(bytes[index]) << 16) | (u32::from(bytes[index + 1]) << 8);
output.push(TABLE[((block >> 18) & 0x3F) as usize] as char);
output.push(TABLE[((block >> 12) & 0x3F) as usize] as char);
output.push(TABLE[((block >> 6) & 0x3F) as usize] as char);
output.push('=');
}
_ => {}
}
output
}
fn print_help() { fn print_help() {
println!("rusty-claude-cli v{VERSION}"); println!("rusty-claude-cli v{VERSION}");
println!(); println!();
@@ -2397,8 +2587,10 @@ mod tests {
render_memory_report, render_repl_help, resume_supported_slash_commands, status_context, render_memory_report, render_repl_help, resume_supported_slash_commands, status_context,
CliAction, CliOutputFormat, SlashCommand, StatusUsage, DEFAULT_MODEL, CliAction, CliOutputFormat, SlashCommand, StatusUsage, DEFAULT_MODEL,
}; };
use api::InputContentBlock;
use runtime::{ContentBlock, ConversationMessage, MessageRole, PermissionMode}; use runtime::{ContentBlock, ConversationMessage, MessageRole, PermissionMode};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
#[test] #[test]
fn defaults_to_repl_when_no_args() { fn defaults_to_repl_when_no_args() {
@@ -2797,7 +2989,7 @@ mod tests {
fn status_context_reads_real_workspace_metadata() { fn status_context_reads_real_workspace_metadata() {
let context = status_context(None).expect("status context should load"); let context = status_context(None).expect("status context should load");
assert!(context.cwd.is_absolute()); assert!(context.cwd.is_absolute());
assert_eq!(context.discovered_config_files, 3); assert!(context.discovered_config_files >= 3);
assert!(context.loaded_config_files <= context.discovered_config_files); assert!(context.loaded_config_files <= context.discovered_config_files);
} }
@@ -2881,11 +3073,110 @@ mod tests {
}, },
]; ];
let converted = super::convert_messages(&messages); let converted = super::convert_messages(&messages).expect("messages should convert");
assert_eq!(converted.len(), 3); assert_eq!(converted.len(), 3);
assert_eq!(converted[1].role, "assistant"); assert_eq!(converted[1].role, "assistant");
assert_eq!(converted[2].role, "user"); assert_eq!(converted[2].role, "user");
} }
#[test]
fn prompt_to_content_blocks_keeps_text_only_prompt() {
let blocks = super::prompt_to_content_blocks("hello world", Path::new("."))
.expect("text prompt should parse");
assert_eq!(
blocks,
vec![InputContentBlock::Text {
text: "hello world".to_string()
}]
);
}
#[test]
fn prompt_to_content_blocks_embeds_at_image_refs() {
let temp = temp_fixture_dir("at-image-ref");
let image_path = temp.join("sample.png");
std::fs::write(&image_path, [1_u8, 2, 3]).expect("fixture write");
let prompt = format!("describe @{} please", image_path.display());
let blocks = super::prompt_to_content_blocks(&prompt, Path::new("."))
.expect("image ref should parse");
assert!(matches!(
&blocks[0],
InputContentBlock::Text { text } if text == "describe "
));
assert!(matches!(
&blocks[1],
InputContentBlock::Image { source }
if source.kind == "base64"
&& source.media_type == "image/png"
&& source.data == "AQID"
));
assert!(matches!(
&blocks[2],
InputContentBlock::Text { text } if text == " please"
));
}
#[test]
fn prompt_to_content_blocks_embeds_markdown_image_refs() {
let temp = temp_fixture_dir("markdown-image-ref");
let image_path = temp.join("sample.webp");
std::fs::write(&image_path, [255_u8]).expect("fixture write");
let prompt = format!("see ![asset]({}) now", image_path.display());
let blocks = super::prompt_to_content_blocks(&prompt, Path::new("."))
.expect("markdown image ref should parse");
assert!(matches!(
&blocks[1],
InputContentBlock::Image { source }
if source.media_type == "image/webp" && source.data == "/w=="
));
}
#[test]
fn prompt_to_content_blocks_rejects_unsupported_formats() {
let temp = temp_fixture_dir("unsupported-image-ref");
let image_path = temp.join("sample.bmp");
std::fs::write(&image_path, [1_u8]).expect("fixture write");
let prompt = format!("describe @{}", image_path.display());
let error = super::prompt_to_content_blocks(&prompt, Path::new("."))
.expect_err("unsupported image ref should fail");
assert!(error.contains("unsupported image format"));
}
#[test]
fn convert_messages_expands_user_text_image_refs() {
let temp = temp_fixture_dir("convert-message-image-ref");
let image_path = temp.join("sample.gif");
std::fs::write(&image_path, [71_u8, 73, 70]).expect("fixture write");
let messages = vec![ConversationMessage::user_text(format!(
"inspect @{}",
image_path.display()
))];
let converted = super::convert_messages(&messages).expect("messages should convert");
assert_eq!(converted.len(), 1);
assert!(matches!(
&converted[0].content[1],
InputContentBlock::Image { source }
if source.media_type == "image/gif" && source.data == "R0lG"
));
}
fn temp_fixture_dir(label: &str) -> PathBuf {
let unique = SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("clock should advance")
.as_nanos();
let path = std::env::temp_dir().join(format!("rusty-claude-cli-{label}-{unique}"));
std::fs::create_dir_all(&path).expect("temp dir should exist");
path
}
#[test] #[test]
fn repl_help_mentions_history_completion_and_multiline() { fn repl_help_mentions_history_completion_and_multiline() {
let help = render_repl_help(); let help = render_repl_help();