Skip to content

Commit c60343a

Browse files
authored
eval: Port to agent2 (#40704)
Release Notes: - N/A
1 parent 4a93719 commit c60343a

22 files changed

+783
-456
lines changed

Cargo.lock

Lines changed: 55 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ members = [
5858
"crates/edit_prediction_context",
5959
"crates/zeta2_tools",
6060
"crates/editor",
61-
# "crates/eval",
61+
"crates/eval",
6262
"crates/explorer_command_injector",
6363
"crates/extension",
6464
"crates/extension_api",

crates/acp_thread/src/terminal.rs

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
use agent_client_protocol as acp;
2-
2+
use anyhow::Result;
33
use futures::{FutureExt as _, future::Shared};
4-
use gpui::{App, AppContext, Context, Entity, Task};
4+
use gpui::{App, AppContext, AsyncApp, Context, Entity, Task};
55
use language::LanguageRegistry;
66
use markdown::Markdown;
7+
use project::Project;
8+
use settings::{Settings as _, SettingsLocation};
79
use std::{path::PathBuf, process::ExitStatus, sync::Arc, time::Instant};
10+
use task::Shell;
11+
use terminal::terminal_settings::TerminalSettings;
12+
use util::get_default_system_shell_preferring_bash;
813

914
pub struct Terminal {
1015
id: acp::TerminalId,
@@ -170,3 +175,68 @@ impl Terminal {
170175
)
171176
}
172177
}
178+
179+
/// Spawns `command` with `args` as a terminal task in `project` and returns the
/// resulting terminal entity.
///
/// The environment is assembled in three layers, later layers winning on
/// conflicting keys (assuming `env` is a map keyed by variable name — TODO confirm):
/// 1. the directory environment the project reports for `cwd` (default/empty
///    when `cwd` is `None` or no environment is produced),
/// 2. `PAGER=""`,
/// 3. the caller-supplied `env_vars`.
///
/// The command is wrapped with `task::ShellBuilder`, using the remote client's
/// default system shell when the project has one that reports a shell, and
/// `get_default_system_shell_preferring_bash()` otherwise.
///
/// # Errors
///
/// Propagates any error from `project.update(..)` and whatever error the task
/// returned by `create_terminal_task` resolves to.
pub async fn create_terminal_entity(
180+
command: String,
181+
args: &[String],
182+
env_vars: Vec<(String, String)>,
183+
cwd: Option<PathBuf>,
184+
project: &Entity<Project>,
185+
cx: &mut AsyncApp,
186+
) -> Result<Entity<terminal::Terminal>> {
187+
// With a working directory, seed the environment from that directory, using
// the terminal shell setting resolved for the worktree location containing it
// (or with no location when `find_worktree` finds none).
let mut env = if let Some(dir) = &cwd {
188+
project
189+
.update(cx, |project, cx| {
190+
let worktree = project.find_worktree(dir.as_path(), cx);
191+
let shell = TerminalSettings::get(
192+
worktree.as_ref().map(|(worktree, path)| SettingsLocation {
193+
worktree_id: worktree.read(cx).id(),
194+
path: &path,
195+
}),
196+
cx,
197+
)
198+
.shell
199+
.clone();
200+
project.directory_environment(&shell, dir.clone().into(), cx)
201+
})?
202+
.await
203+
// No directory environment available: fall back to the default value.
.unwrap_or_default()
204+
} else {
205+
Default::default()
206+
};
207+
208+
// Disables paging for `git` and hopefully other commands
209+
env.insert("PAGER".into(), "".into());
210+
// Applied after the PAGER entry, so caller-supplied variables are added last.
env.extend(env_vars);
211+
212+
// Use remote shell or default system shell, as appropriate
213+
let shell = project
214+
.update(cx, |project, cx| {
215+
project
216+
.remote_client()
217+
.and_then(|r| r.read(cx).default_system_shell())
218+
.map(Shell::Program)
219+
})?
220+
.unwrap_or_else(|| Shell::Program(get_default_system_shell_preferring_bash()));
221+
// Take the platform from the project's path style; if the project cannot be
// read, fall back to the platform this binary was compiled for.
let is_windows = project
222+
.read_with(cx, |project, cx| project.path_style(cx).is_windows())
223+
.unwrap_or(cfg!(windows));
224+
let (task_command, task_args) = task::ShellBuilder::new(&shell, is_windows)
225+
.redirect_stdin_to_dev_null()
226+
.build(Some(command.clone()), &args);
227+
228+
project
229+
.update(cx, |project, cx| {
230+
project.create_terminal_task(
231+
task::SpawnInTerminal {
232+
command: Some(task_command),
233+
args: task_args,
234+
cwd,
235+
env,
236+
..Default::default()
237+
},
238+
cx,
239+
)
240+
})?
241+
.await
242+
}

crates/agent/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ path = "src/agent.rs"
1010

1111
[features]
1212
test-support = ["db/test-support"]
13+
eval = []
14+
edit-agent-eval = []
1315
e2e = []
1416

1517
[lints]

crates/agent/src/edit_agent/evals.rs

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ use std::{
3131
use util::path;
3232

3333
#[test]
34-
#[cfg_attr(not(feature = "eval"), ignore)]
34+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
3535
fn eval_extract_handle_command_output() {
3636
// Test how well agent generates multiple edit hunks.
3737
//
@@ -108,7 +108,7 @@ fn eval_extract_handle_command_output() {
108108
}
109109

110110
#[test]
111-
#[cfg_attr(not(feature = "eval"), ignore)]
111+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
112112
fn eval_delete_run_git_blame() {
113113
// Model | Pass rate
114114
// ----------------------------|----------
@@ -171,7 +171,7 @@ fn eval_delete_run_git_blame() {
171171
}
172172

173173
#[test]
174-
#[cfg_attr(not(feature = "eval"), ignore)]
174+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
175175
fn eval_translate_doc_comments() {
176176
// Model | Pass rate
177177
// ============================================
@@ -234,7 +234,7 @@ fn eval_translate_doc_comments() {
234234
}
235235

236236
#[test]
237-
#[cfg_attr(not(feature = "eval"), ignore)]
237+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
238238
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
239239
// Model | Pass rate
240240
// ============================================
@@ -360,7 +360,7 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
360360
}
361361

362362
#[test]
363-
#[cfg_attr(not(feature = "eval"), ignore)]
363+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
364364
fn eval_disable_cursor_blinking() {
365365
// Model | Pass rate
366366
// ============================================
@@ -446,7 +446,7 @@ fn eval_disable_cursor_blinking() {
446446
}
447447

448448
#[test]
449-
#[cfg_attr(not(feature = "eval"), ignore)]
449+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
450450
fn eval_from_pixels_constructor() {
451451
// Results for 2025-06-13
452452
//
@@ -656,7 +656,7 @@ fn eval_from_pixels_constructor() {
656656
}
657657

658658
#[test]
659-
#[cfg_attr(not(feature = "eval"), ignore)]
659+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
660660
fn eval_zode() {
661661
// Model | Pass rate
662662
// ============================================
@@ -763,7 +763,7 @@ fn eval_zode() {
763763
}
764764

765765
#[test]
766-
#[cfg_attr(not(feature = "eval"), ignore)]
766+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
767767
fn eval_add_overwrite_test() {
768768
// Model | Pass rate
769769
// ============================================
@@ -995,7 +995,7 @@ fn eval_add_overwrite_test() {
995995
}
996996

997997
#[test]
998-
#[cfg_attr(not(feature = "eval"), ignore)]
998+
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
999999
fn eval_create_empty_file() {
10001000
// Check that Edit Agent can create a file without writing its
10011001
// thoughts into it. This issue is not specific to empty files, but
@@ -1490,9 +1490,20 @@ impl EditAgentTest {
14901490
&std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-4-sonnet-latest".into()),
14911491
)
14921492
.unwrap();
1493+
1494+
let authenticate_provider_tasks = cx.update(|cx| {
1495+
LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1496+
registry
1497+
.providers()
1498+
.iter()
1499+
.map(|p| p.authenticate(cx))
1500+
.collect::<Vec<_>>()
1501+
})
1502+
});
14931503
let (agent_model, judge_model) = cx
14941504
.update(|cx| {
14951505
cx.spawn(async move |cx| {
1506+
futures::future::join_all(authenticate_provider_tasks).await;
14961507
let agent_model = Self::load_model(&agent_model, cx).await;
14971508
let judge_model = Self::load_model(&judge_model, cx).await;
14981509
(agent_model.unwrap(), judge_model.unwrap())

crates/agent/src/tests/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1995,7 +1995,7 @@ async fn test_tool_updates_to_completion(cx: &mut TestAppContext) {
19951995
locations: vec![],
19961996
raw_input: Some(json!({})),
19971997
raw_output: None,
1998-
meta: None,
1998+
meta: Some(json!({ "tool_name": "thinking" })),
19991999
}
20002000
);
20012001
let update = expect_tool_call_update_fields(&mut events).await;

crates/agent/src/thread.rs

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -745,7 +745,13 @@ impl Thread {
745745

746746
let title = tool.initial_title(tool_use.input.clone(), cx);
747747
let kind = tool.kind();
748-
stream.send_tool_call(&tool_use.id, title, kind, tool_use.input.clone());
748+
stream.send_tool_call(
749+
&tool_use.id,
750+
&tool_use.name,
751+
title,
752+
kind,
753+
tool_use.input.clone(),
754+
);
749755

750756
let output = tool_result
751757
.as_ref()
@@ -1044,14 +1050,18 @@ impl Thread {
10441050
Ok(())
10451051
}
10461052

1047-
pub fn latest_token_usage(&self) -> Option<acp_thread::TokenUsage> {
1053+
pub fn latest_request_token_usage(&self) -> Option<language_model::TokenUsage> {
10481054
let last_user_message = self.last_user_message()?;
10491055
let tokens = self.request_token_usage.get(&last_user_message.id)?;
1050-
let model = self.model.clone()?;
1056+
Some(*tokens)
1057+
}
10511058

1059+
pub fn latest_token_usage(&self) -> Option<acp_thread::TokenUsage> {
1060+
let usage = self.latest_request_token_usage()?;
1061+
let model = self.model.clone()?;
10521062
Some(acp_thread::TokenUsage {
10531063
max_tokens: model.max_token_count_for_mode(self.completion_mode.into()),
1054-
used_tokens: tokens.total_tokens(),
1064+
used_tokens: usage.total_tokens(),
10551065
})
10561066
}
10571067

@@ -1094,6 +1104,14 @@ impl Thread {
10941104
self.run_turn(cx)
10951105
}
10961106

1107+
/// Public entry point that delegates directly to `run_turn`, returning the
/// same receiver of `ThreadEvent` results (or the same error) that `run_turn`
/// produces.
///
/// Only compiled when the `eval` feature is enabled.
#[cfg(feature = "eval")]
1108+
pub fn proceed(
1109+
&mut self,
1110+
cx: &mut Context<Self>,
1111+
) -> Result<mpsc::UnboundedReceiver<Result<ThreadEvent>>> {
1112+
self.run_turn(cx)
1113+
}
1114+
10971115
fn run_turn(
10981116
&mut self,
10991117
cx: &mut Context<Self>,
@@ -1461,7 +1479,13 @@ impl Thread {
14611479
});
14621480

14631481
if push_new_tool_use {
1464-
event_stream.send_tool_call(&tool_use.id, title, kind, tool_use.input.clone());
1482+
event_stream.send_tool_call(
1483+
&tool_use.id,
1484+
&tool_use.name,
1485+
title,
1486+
kind,
1487+
tool_use.input.clone(),
1488+
);
14651489
last_message
14661490
.content
14671491
.push(AgentMessageContent::ToolUse(tool_use.clone()));
@@ -2256,13 +2280,15 @@ impl ThreadEventStream {
22562280
fn send_tool_call(
22572281
&self,
22582282
id: &LanguageModelToolUseId,
2283+
tool_name: &str,
22592284
title: SharedString,
22602285
kind: acp::ToolKind,
22612286
input: serde_json::Value,
22622287
) {
22632288
self.0
22642289
.unbounded_send(Ok(ThreadEvent::ToolCall(Self::initial_tool_call(
22652290
id,
2291+
tool_name,
22662292
title.to_string(),
22672293
kind,
22682294
input,
@@ -2272,12 +2298,15 @@ impl ThreadEventStream {
22722298

22732299
fn initial_tool_call(
22742300
id: &LanguageModelToolUseId,
2301+
tool_name: &str,
22752302
title: String,
22762303
kind: acp::ToolKind,
22772304
input: serde_json::Value,
22782305
) -> acp::ToolCall {
22792306
acp::ToolCall {
2280-
meta: None,
2307+
meta: Some(serde_json::json!({
2308+
"tool_name": tool_name
2309+
})),
22812310
id: acp::ToolCallId(id.to_string().into()),
22822311
title,
22832312
kind,

0 commit comments

Comments
 (0)