diff --git a/.changeset/shiny-humans-greet.md b/.changeset/shiny-humans-greet.md
new file mode 100644
index 00000000..b393251f
--- /dev/null
+++ b/.changeset/shiny-humans-greet.md
@@ -0,0 +1,5 @@
+---
+'@openai/agents-core': minor
+---
+
+feat: #663 Added helper functions for ComputerBaseInvoke
diff --git a/packages/agents-core/src/computer.ts b/packages/agents-core/src/computer.ts
index 7aeed599..c35e8e05 100644
--- a/packages/agents-core/src/computer.ts
+++ b/packages/agents-core/src/computer.ts
@@ -1,18 +1,23 @@
+import * as protocol from './types/protocol';
+
 export type Environment = 'mac' | 'windows' | 'ubuntu' | 'browser';
 export type Button = 'left' | 'right' | 'wheel' | 'back' | 'forward';
 
 import { Expand, SnakeToCamelCase } from './types/helpers';
 import type { ComputerAction } from './types/protocol';
+import type { RunContext } from './runContext';
 
 type Promisable<T> = T | Promise<T>;
 
 /**
  * Interface to implement for a computer environment to be used by the agent.
  */
-interface ComputerBase {
+type ComputerCommon = {
   environment: Environment;
   dimensions: [number, number];
+};
 
+type ComputerBaseMethods = {
   screenshot(): Promisable<string>;
   click(x: number, y: number, button: Button): Promisable<void>;
   doubleClick(x: number, y: number): Promisable<void>;
@@ -27,7 +32,15 @@ interface ComputerBase {
   move(x: number, y: number): Promisable<void>;
   keypress(keys: string[]): Promisable<void>;
   drag(path: [number, number][]): Promisable<void>;
-}
+};
+
+type ComputerBase = ComputerCommon & ComputerBaseMethods & {
+  invoke?: undefined;
+};
+
+type ComputerBaseInvoke = ComputerCommon & {
+  invoke(runContext: RunContext, toolCall: protocol.ComputerUseCallItem): Promisable<string>;
+} & { [K in keyof ComputerBaseMethods]: never };
 
 // This turns every snake_case string in the ComputerAction['type'] into a camelCase string
 type ActionNames = SnakeToCamelCase<ComputerAction['type']>;
@@ -38,5 +51,27 @@ type ActionNames = SnakeToCamelCase<ComputerAction['type']>;
  * action names beyond those in `ComputerAction` are present.
  */
 export type Computer = Expand<
-  ComputerBase & Record<Exclude<ActionNames, keyof ComputerBase>, never>
+  | (ComputerBase & Record<Exclude<ActionNames, keyof ComputerBase>, never>)
+  | (ComputerBaseInvoke & Record<Exclude<ActionNames, keyof ComputerBaseInvoke>, never>)
 >;
+
+/**
+ * Type guard that reports whether `computer` exposes a callable `invoke`
+ * member, i.e. is the invoke-style variant of `Computer`.
+ */
+export function isInvokeComputer(computer: Computer): computer is ComputerBaseInvoke {
+  return typeof (computer as { invoke?: unknown }).invoke === 'function';
+}
+
+/**
+ * Accepts an object carrying only `environment`, `dimensions` and `invoke`
+ * and returns it typed as `Computer`. The value is returned unchanged; only
+ * its static type is asserted.
+ */
+export function asInvokeComputer(
+  computer: ComputerCommon & {
+    invoke(runContext: RunContext, toolCall: protocol.ComputerUseCallItem): Promisable<string>;
+  },
+): Computer {
+  return computer as ComputerBaseInvoke;
+}
diff --git a/packages/agents-core/src/index.ts b/packages/agents-core/src/index.ts
index b993c732..ffd3440a 100644
--- a/packages/agents-core/src/index.ts
+++ b/packages/agents-core/src/index.ts
@@ -13,7 +13,7 @@ export {
   ToolUseBehavior,
   ToolUseBehaviorFlags,
 } from './agent';
-export { Computer } from './computer';
+export { Computer, asInvokeComputer, isInvokeComputer } from './computer';
 export { ShellAction, ShellResult, ShellOutputResult, Shell } from './shell';
 export { ApplyPatchOperation, ApplyPatchResult, Editor } from './editor';
 export {
diff --git a/packages/agents-core/src/runImplementation.ts b/packages/agents-core/src/runImplementation.ts
index bd66582e..46adefec 100644
--- a/packages/agents-core/src/runImplementation.ts
+++ b/packages/agents-core/src/runImplementation.ts
@@ -54,7 +54,7 @@ import { RunItemStreamEvent, RunItemStreamEventName } from './events';
 import { RunResult, StreamedRunResult } from './result';
 import { z } from 'zod';
 import * as protocol from './types/protocol';
-import { Computer } from './computer';
+import { Computer, isInvokeComputer } from './computer';
 import type { ApplyPatchResult } from './editor';
 import { RunState } from './runState';
 import { isZodObject } from './utils';
@@ -1767,7 +1767,11 @@ export async function executeComputerActions(
     // Run the action and get screenshot
     let output: string;
     try {
-      output = await _runComputerActionAndScreenshot(computer, toolCall);
+      if (isInvokeComputer(computer)) {
+        output = await computer.invoke(runContext, toolCall);
+      } else {
+        output = await _runComputerActionAndScreenshot(computer, toolCall);
+      }
     } catch (err) {
       _logger.error('Failed to execute computer action:', err);
       output = '';
diff --git a/packages/agents-core/test/runImplementation.test.ts b/packages/agents-core/test/runImplementation.test.ts
index 68ffb987..fdd2e005 100644
--- a/packages/agents-core/test/runImplementation.test.ts
+++ b/packages/agents-core/test/runImplementation.test.ts
@@ -48,7 +48,7 @@ import {
 } from '../src/tool';
 import { handoff } from '../src/handoff';
 import { ModelBehaviorError, UserError } from '../src/errors';
-import { Computer } from '../src/computer';
+import { Computer, asInvokeComputer } from '../src/computer';
 import { Usage } from '../src/usage';
 import { setTracingDisabled, withTrace } from '../src';
 
@@ -1855,7 +1855,7 @@ describe('executeFunctionToolCalls', () => {
   });
 });
 
-describe('executeComputerActions', () => {
+describe('executeComputerActions - original', () => {
   function makeComputer(): Computer {
     return {
       environment: 'mac',
@@ -1953,6 +1953,89 @@ describe('executeComputerActions', () => {
   });
 });
 
+describe('executeComputerActions - invoke', () => {
+  function makeComputer(): Computer {
+    return asInvokeComputer({
+      environment: 'mac',
+      dimensions: [1, 1],
+      invoke: vi.fn(async () => 'img'),
+    });
+  }
+
+  const actions: protocol.ComputerAction[] = [
+    { type: 'click', x: 1, y: 2, button: 'left' },
+    { type: 'double_click', x: 2, y: 2 },
+    { type: 'drag', path: [{ x: 1, y: 1 }] },
+    { type: 'keypress', keys: ['a'] },
+    { type: 'move', x: 3, y: 3 },
+    { type: 'screenshot' },
+    { type: 'scroll', x: 0, y: 0, scroll_x: 0, scroll_y: 1 },
+    { type: 'type', text: 'hi' },
+    { type: 'wait' },
+  ];
+
+  it('invokes computer methods and returns screenshots', async () => {
+    const comp = makeComputer();
+    const tool = computerTool({ computer: comp });
+    const calls = actions.map((a, i) => ({
+      toolCall: {
+        id: `id${i}`,
+        type: 'computer_call',
+        callId: `id${i}`,
+        status: 'completed',
+        action: a,
+      } as protocol.ComputerUseCallItem,
+      computer: tool,
+    }));
+
+    const result = await withTrace('test', () =>
+      executeComputerActions(
+        new Agent({ name: 'C' }),
+        calls,
+        new Runner({ tracingDisabled: true }),
+        new RunContext(),
+      ),
+    );
+
+    expect(result).toHaveLength(actions.length);
+    expect(result.every((r) => r instanceof ToolCallOutputItem)).toBe(true);
+    expect(comp.invoke).toHaveBeenCalledTimes(actions.length);
+  });
+
+  it('returns empty output when invoke yields no screenshot', async () => {
+    const comp: any = {
+      environment: 'mac',
+      dimensions: [1, 1],
+      invoke: vi.fn(async () => null),
+    };
+    const tool = computerTool({ computer: comp });
+    const call = {
+      toolCall: {
+        id: 'id',
+        type: 'computer_call',
+        callId: 'id',
+        status: 'completed',
+        action: { type: 'click', x: 1, y: 1, button: 'left' },
+      } as protocol.ComputerUseCallItem,
+      computer: tool,
+    };
+    const res = await withTrace('test', () =>
+      executeComputerActions(
+        new Agent({ name: 'C' }),
+        [call],
+        new Runner({ tracingDisabled: true }),
+        new RunContext(),
+        { error: (_: string) => {} } as unknown as Logger,
+      ),
+    );
+
+    expect(res[0]).toBeInstanceOf(ToolCallOutputItem);
+    expect(res[0].type).toBe('tool_call_output_item');
+    expect(res[0].rawItem.type).toBe('computer_call_result');
+    expect((res[0].rawItem as any).output.data).toBe('');
+  });
+});
+
 describe('executeHandoffCalls', () => {
   it('executes single handoff', async () => {
     const target = new Agent({ name: 'Target' });