15 changes: 15 additions & 0 deletions codex-rs/core/src/tools/handlers/view_image.rs
@@ -1,5 +1,6 @@
use async_trait::async_trait;
use codex_protocol::models::FunctionCallOutputBody;
use codex_protocol::openai_models::InputModality;
use serde::Deserialize;
use tokio::fs;

@@ -18,6 +19,9 @@ use codex_protocol::models::local_image_content_items_with_label_number;

pub struct ViewImageHandler;

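// Message returned to the model when view_image is invoked for a model that lacks image input support.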
const VIEW_IMAGE_UNSUPPORTED_MESSAGE: &str =
"view_image is not allowed because you do not support image inputs";

#[derive(Deserialize)]
struct ViewImageArgs {
path: String,
@@ -30,6 +34,17 @@ impl ToolHandler for ViewImageHandler {
}

async fn handle(&self, invocation: ToolInvocation) -> Result<ToolOutput, FunctionCallError> {
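// Reject the call up front when the active model does not accept image inputs.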
if !invocation
.turn
.model_info
.input_modalities
.contains(&InputModality::Image)
{
return Err(FunctionCallError::RespondToModel(
VIEW_IMAGE_UNSUPPORTED_MESSAGE.to_string(),
));
}

let ToolInvocation {
session,
turn,
32 changes: 2 additions & 30 deletions codex-rs/core/src/tools/spec.rs
@@ -17,7 +17,6 @@ use codex_protocol::dynamic_tools::DynamicToolSpec;
use codex_protocol::models::VIEW_IMAGE_TOOL_NAME;
use codex_protocol::openai_models::ApplyPatchToolType;
use codex_protocol::openai_models::ConfigShellToolType;
use codex_protocol::openai_models::InputModality;
use codex_protocol::openai_models::ModelInfo;
use serde::Deserialize;
use serde::Serialize;
@@ -31,7 +30,6 @@ pub(crate) struct ToolsConfig {
pub shell_type: ConfigShellToolType,
pub apply_patch_tool_type: Option<ApplyPatchToolType>,
pub web_search_mode: Option<WebSearchMode>,
pub supports_image_input: bool,
pub search_tool: bool,
pub collab_tools: bool,
pub collaboration_modes_tools: bool,
@@ -87,7 +85,6 @@
shell_type,
apply_patch_tool_type,
web_search_mode: *web_search_mode,
supports_image_input: model_info.input_modalities.contains(&InputModality::Image),
search_tool: include_search_tool,
collab_tools: include_collab_tools,
collaboration_modes_tools: include_collaboration_modes_tools,
@@ -1498,10 +1495,8 @@ pub(crate) fn build_specs(
Some(WebSearchMode::Disabled) | None => {}
}

if config.supports_image_input {
builder.push_spec_with_parallel_support(create_view_image_tool(), true);
builder.register_handler("view_image", view_image_handler);
}
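// view_image is now registered unconditionally; the handler rejects models without image input support at call time.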
builder.push_spec_with_parallel_support(create_view_image_tool(), true);
builder.register_handler("view_image", view_image_handler);

if config.collab_tools {
let collab_handler = Arc::new(CollabHandler);
@@ -2076,29 +2071,6 @@
);
}

#[test]
fn test_non_multimodal_models_exclude_view_image() {
let config = test_config();
let mut model_info = ModelsManager::construct_model_info_offline("gpt-5.1", &config);
model_info.input_modalities = vec![InputModality::Text];
let mut features = Features::with_defaults();
features.enable(Feature::CollaborationModes);
let tools_config = ToolsConfig::new(&ToolsConfigParams {
model_info: &model_info,
features: &features,
web_search_mode: Some(WebSearchMode::Cached),
});
let (tools, _) = build_specs(&tools_config, Some(HashMap::new()), &[]).build();

assert!(
!tools
.iter()
.map(|t| t.spec.name())
.any(|name| name == VIEW_IMAGE_TOOL_NAME),
"view_image should be excluded for non-multimodal models"
);
}

#[test]
fn test_gpt_5_1_codex_max_unified_exec_web_search() {
let mut features = Features::with_defaults();
127 changes: 127 additions & 0 deletions codex-rs/core/tests/suite/view_image.rs
@@ -2,17 +2,28 @@

use base64::Engine;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use codex_core::CodexAuth;
use codex_core::features::Feature;
use codex_core::protocol::AskForApproval;
use codex_core::protocol::EventMsg;
use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary;
use codex_protocol::openai_models::ConfigShellToolType;
use codex_protocol::openai_models::InputModality;
use codex_protocol::openai_models::ModelInfo;
use codex_protocol::openai_models::ModelVisibility;
use codex_protocol::openai_models::ModelsResponse;
use codex_protocol::openai_models::ReasoningEffort;
use codex_protocol::openai_models::ReasoningEffortPreset;
use codex_protocol::openai_models::TruncationPolicyConfig;
use codex_protocol::user_input::UserInput;
use core_test_support::responses;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_function_call;
use core_test_support::responses::ev_response_created;
use core_test_support::responses::mount_models_once;
use core_test_support::responses::sse;
use core_test_support::responses::start_mock_server;
use core_test_support::skip_if_no_network;
@@ -26,6 +37,8 @@ use image::Rgba;
use image::load_from_memory;
use serde_json::Value;
use tokio::time::Duration;
use wiremock::BodyPrintLimit;
use wiremock::MockServer;

fn find_image_message(body: &Value) -> Option<&Value> {
body.get("input")
@@ -521,6 +534,120 @@ async fn view_image_tool_errors_when_file_missing() -> anyhow::Result<()> {
Ok(())
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn view_image_tool_returns_unsupported_message_for_text_only_model() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));

// Use MockServer directly (not start_mock_server) so the first /models request returns our
// text-only model. start_mock_server mounts empty models first, causing get_model_info to
// fall back to model_info_from_slug with default_input_modalities (Text+Image), which would
// incorrectly allow view_image.
let server = MockServer::builder()
.body_print_limit(BodyPrintLimit::Limited(80_000))
.start()
.await;
let model_slug = "text-only-view-image-test-model";
let text_only_model = ModelInfo {
slug: model_slug.to_string(),
display_name: "Text-only view_image test model".to_string(),
description: Some("Remote model for view_image unsupported-path coverage".to_string()),
default_reasoning_level: Some(ReasoningEffort::Medium),
supported_reasoning_levels: vec![ReasoningEffortPreset {
effort: ReasoningEffort::Medium,
description: ReasoningEffort::Medium.to_string(),
}],
shell_type: ConfigShellToolType::ShellCommand,
visibility: ModelVisibility::List,
supported_in_api: true,
input_modalities: vec![InputModality::Text],
priority: 1,
upgrade: None,
base_instructions: "base instructions".to_string(),
model_messages: None,
supports_reasoning_summaries: false,
support_verbosity: false,
default_verbosity: None,
apply_patch_tool_type: None,
truncation_policy: TruncationPolicyConfig::bytes(10_000),
supports_parallel_tool_calls: false,
context_window: Some(272_000),
auto_compact_token_limit: None,
effective_context_window_percent: 95,
experimental_supported_tools: Vec::new(),
};
mount_models_once(
&server,
ModelsResponse {
models: vec![text_only_model],
},
)
.await;

let TestCodex { codex, cwd, .. } = test_codex()
.with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
.with_config(|config| {
config.features.enable(Feature::RemoteModels);
config.model = Some(model_slug.to_string());
})
.build(&server)
.await?;

let rel_path = "assets/example.png";
let abs_path = cwd.path().join(rel_path);
if let Some(parent) = abs_path.parent() {
std::fs::create_dir_all(parent)?;
}
let image = ImageBuffer::from_pixel(20, 20, Rgba([255u8, 0, 0, 255]));
image.save(&abs_path)?;

let call_id = "view-image-unsupported-model";
let arguments = serde_json::json!({ "path": rel_path }).to_string();
let first_response = sse(vec![
ev_response_created("resp-1"),
ev_function_call(call_id, "view_image", &arguments),
ev_completed("resp-1"),
]);
responses::mount_sse_once(&server, first_response).await;

let second_response = sse(vec![
ev_assistant_message("msg-1", "done"),
ev_completed("resp-2"),
]);
let mock = responses::mount_sse_once(&server, second_response).await;

codex
.submit(Op::UserTurn {
items: vec![UserInput::Text {
text: "please attach the image".into(),
text_elements: Vec::new(),
}],
final_output_json_schema: None,
cwd: cwd.path().to_path_buf(),
approval_policy: AskForApproval::Never,
sandbox_policy: SandboxPolicy::DangerFullAccess,
model: model_slug.to_string(),
effort: None,
summary: ReasoningSummary::Auto,
collaboration_mode: None,
personality: None,
})
.await?;

wait_for_event(&codex, |event| matches!(event, EventMsg::TurnComplete(_))).await;

let output_text = mock
.single_request()
.function_call_output_content_and_success(call_id)
.and_then(|(content, _)| content)
.expect("output text present");
assert_eq!(
output_text,
"view_image is not allowed because you do not support image inputs"
);

Ok(())
}

#[cfg(not(debug_assertions))]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn replaces_invalid_local_image_after_bad_request() -> anyhow::Result<()> {