diff --git a/docs/src/api/params.md b/docs/src/api/params.md index fa45a5c7ed1ca..8eb29d4df8f84 100644 --- a/docs/src/api/params.md +++ b/docs/src/api/params.md @@ -375,7 +375,7 @@ Emulates consistent window screen size available inside web page via `window.scr - `agent` <[Object]> - `provider` <[string]> LLM provider to use - `model` <[string]> Model identifier within provider - - `cacheDir` ?<[string]> Cache folder to use/generate code for performed actions into. Cache is not used if not specified (default). + - `cacheFile` ?<[string]> Cache file to use/generate code for performed actions into. Cache is not used if not specified (default). - `cacheMode` ?<['force'|'ignore'|'auto']> Cache control, defauls to 'auto' Agent settings for [`method: Page.perform`] and [`method: Page.extract`]. diff --git a/packages/playwright-client/types/types.d.ts b/packages/playwright-client/types/types.d.ts index 1f92412180cf3..cd99f7dd5b57f 100644 --- a/packages/playwright-client/types/types.d.ts +++ b/packages/playwright-client/types/types.d.ts @@ -22097,9 +22097,9 @@ export interface BrowserContextOptions { model: string; /** - * Cache folder to use/generate code for performed actions into. Cache is not used if not specified (default). + * Cache file to use/generate code for performed actions into. Cache is not used if not specified (default). 
*/ - cacheDir?: string; + cacheFile?: string; /** * Cache control, defauls to 'auto' diff --git a/packages/playwright-core/src/client/page.ts b/packages/playwright-core/src/client/page.ts index 4618722430825..c7c3dd1a8665f 100644 --- a/packages/playwright-core/src/client/page.ts +++ b/packages/playwright-core/src/client/page.ts @@ -847,11 +847,12 @@ export class Page extends ChannelOwner implements api.Page } async perform(task: string, options: { key?: string, maxTurns?: number } = {}): Promise { - throw new Error('Not implemented in playwright-core'); + await this._channel.perform({ task, ...options }); } - extract(query: string, schema: Schema, options: { maxTurns?: number } = {}): Promise> { - throw new Error('Not implemented in playwright-core'); + async extract(query: string, schema: Schema, options: { maxTurns?: number } = {}): Promise> { + const { result } = await this._channel.extract({ query, schema: this._platform.zodToJsonSchema(schema), ...options }); + return result; } async _snapshotForAI(options: TimeoutOptions & { track?: string } = {}): Promise<{ full: string, incremental?: string }> { diff --git a/packages/playwright-core/src/client/platform.ts b/packages/playwright-core/src/client/platform.ts index 5228e8a3580ad..dd4fd1727ef28 100644 --- a/packages/playwright-core/src/client/platform.ts +++ b/packages/playwright-core/src/client/platform.ts @@ -59,6 +59,7 @@ export type Platform = { streamFile: (path: string, writable: Writable) => Promise, streamReadable: (channel: channels.StreamChannel) => Readable, streamWritable: (channel: channels.WritableStreamChannel) => Writable, + zodToJsonSchema: (schema: any) => any, zones: { empty: Zone, current: () => Zone; }; }; @@ -119,5 +120,9 @@ export const emptyPlatform: Platform = { throw new Error('Streams are not available'); }, + zodToJsonSchema: (schema: any) => { + throw new Error('Zod is not available'); + }, + zones: { empty: noopZone, current: () => noopZone }, }; diff --git 
a/packages/playwright-core/src/protocol/validator.ts b/packages/playwright-core/src/protocol/validator.ts index 8a438c65aa1a1..0cf1d9f62bb86 100644 --- a/packages/playwright-core/src/protocol/validator.ts +++ b/packages/playwright-core/src/protocol/validator.ts @@ -605,7 +605,7 @@ scheme.BrowserTypeLaunchPersistentContextParams = tObject({ agent: tOptional(tObject({ provider: tString, model: tString, - cacheDir: tOptional(tString), + cacheFile: tOptional(tString), cacheMode: tOptional(tEnum(['ignore', 'force', 'auto'])), })), userDataDir: tString, @@ -703,7 +703,7 @@ scheme.BrowserNewContextParams = tObject({ agent: tOptional(tObject({ provider: tString, model: tString, - cacheDir: tOptional(tString), + cacheFile: tOptional(tString), cacheMode: tOptional(tEnum(['ignore', 'force', 'auto'])), })), proxy: tOptional(tObject({ @@ -780,7 +780,7 @@ scheme.BrowserNewContextForReuseParams = tObject({ agent: tOptional(tObject({ provider: tString, model: tString, - cacheDir: tOptional(tString), + cacheFile: tOptional(tString), cacheMode: tOptional(tEnum(['ignore', 'force', 'auto'])), })), proxy: tOptional(tObject({ @@ -902,7 +902,7 @@ scheme.BrowserContextInitializer = tObject({ agent: tOptional(tObject({ provider: tString, model: tString, - cacheDir: tOptional(tString), + cacheFile: tOptional(tString), cacheMode: tOptional(tEnum(['ignore', 'force', 'auto'])), })), }), @@ -1506,6 +1506,20 @@ scheme.PageUpdateSubscriptionParams = tObject({ enabled: tBoolean, }); scheme.PageUpdateSubscriptionResult = tOptional(tObject({})); +scheme.PagePerformParams = tObject({ + task: tString, + key: tOptional(tString), + maxTurns: tOptional(tInt), +}); +scheme.PagePerformResult = tOptional(tObject({})); +scheme.PageExtractParams = tObject({ + query: tString, + schema: tAny, + maxTurns: tOptional(tInt), +}); +scheme.PageExtractResult = tObject({ + result: tAny, +}); scheme.FrameInitializer = tObject({ url: tString, name: tString, @@ -2797,7 +2811,7 @@ scheme.AndroidDeviceLaunchBrowserParams = 
tObject({ agent: tOptional(tObject({ provider: tString, model: tString, - cacheDir: tOptional(tString), + cacheFile: tOptional(tString), cacheMode: tOptional(tEnum(['ignore', 'force', 'auto'])), })), pkg: tOptional(tString), diff --git a/packages/playwright-core/src/server/agent/DEPS.list b/packages/playwright-core/src/server/agent/DEPS.list new file mode 100644 index 0000000000000..515385b0887b4 --- /dev/null +++ b/packages/playwright-core/src/server/agent/DEPS.list @@ -0,0 +1,6 @@ +[*] +../browserContext.ts +../page.ts +../progress.ts +../../mcpBundle.ts +../../utilsBundle.ts diff --git a/packages/playwright-core/src/server/agent/actionRunner.ts b/packages/playwright-core/src/server/agent/actionRunner.ts new file mode 100644 index 0000000000000..fdb9f81d23d1a --- /dev/null +++ b/packages/playwright-core/src/server/agent/actionRunner.ts @@ -0,0 +1,52 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
import type * as actions from './actions';
import type { Page } from '../page';
import type { Progress } from '../progress';

// Every selector-based action is executed in strict mode.
const strictTrue = { strict: true };

/**
 * Executes a single agent action against the main frame (or keyboard) of `page`.
 *
 * @param progress Progress object forwarded to every frame / keyboard call.
 * @param page Page whose main frame and keyboard receive the action.
 * @param action Action to execute; `action.method` selects the operation.
 * @throws Error when `action.method` is not one of the supported methods.
 */
export async function runAction(progress: Progress, page: Page, action: actions.Action): Promise<void> {
  const frame = page.mainFrame();
  switch (action.method) {
    case 'click':
      await frame.click(progress, action.selector, { ...action.options, ...strictTrue });
      break;
    case 'drag':
      await frame.dragAndDrop(progress, action.sourceSelector, action.targetSelector, { ...strictTrue });
      break;
    case 'hover':
      await frame.hover(progress, action.selector, { ...action.options, ...strictTrue });
      break;
    case 'selectOption':
      await frame.selectOption(progress, action.selector, [], action.values.map(value => ({ value })), { ...strictTrue });
      break;
    case 'pressKey':
      await page.keyboard.press(progress, action.key);
      break;
    case 'pressSequentially':
      await frame.type(progress, action.selector, action.text, { ...strictTrue });
      if (action.submit)
        await page.keyboard.press(progress, 'Enter');
      break;
    case 'fill':
      await frame.fill(progress, action.selector, action.text, { ...strictTrue });
      if (action.submit)
        await page.keyboard.press(progress, 'Enter');
      break;
    default: {
      // Compile-time exhaustiveness check. At runtime this branch is still reachable because
      // actions may be replayed from a JSON cache file whose contents are not validated.
      const unsupported: never = action;
      throw new Error(`Unsupported action method: ${String((unsupported as { method?: unknown }).method)}`);
    }
  }
}
import type * as channels from '@protocol/channels';

// NOTE(review): the type arguments of both `Pick` usages below were missing in this view of the
// file. They are restored from the option fields the agent's click / hover tools populate
// (`button`, `clickCount`, `modifiers`) — confirm against the protocol channel types.

/** Click on the element matched by `selector`. */
export type ClickAction = {
  method: 'click';
  selector: string;
  options: Pick<channels.FrameClickParams, 'button' | 'clickCount' | 'modifiers'>;
};

/** Drag the element matched by `sourceSelector` onto the one matched by `targetSelector`. */
export type DragAction = {
  method: 'drag';
  sourceSelector: string;
  targetSelector: string;
};

/** Hover over the element matched by `selector`. */
export type HoverAction = {
  method: 'hover';
  selector: string;
  options: Pick<channels.FrameHoverParams, 'modifiers'>;
};

/** Select the given option values in the element matched by `selector`. */
export type SelectOptionAction = {
  method: 'selectOption';
  selector: string;
  values: string[];
};

/** Press a single key on the page keyboard. */
export type PressAction = {
  method: 'pressKey';
  key: string;
};

/** Type `text` into the element; when `submit` is set, Enter is pressed afterwards. */
export type PressSequentiallyAction = {
  method: 'pressSequentially';
  selector: string;
  text: string;
  submit?: boolean;
};

/** Fill the element with `text`; when `submit` is set, Enter is pressed afterwards. */
export type FillAction = {
  method: 'fill';
  selector: string;
  text: string;
  submit?: boolean;
};

/** Union of all agent actions, discriminated by `method`. */
export type Action = ClickAction | DragAction | HoverAction | SelectOptionAction | PressAction | PressSequentiallyAction | FillAction;
import fs from 'fs';

import { toolsForLoop } from './backend';
import { debug } from '../../utilsBundle';
import { Loop, z, zodToJsonSchema } from '../../mcpBundle';
import { runAction } from './actionRunner';
import { Context } from './context';

import type { Progress } from '../progress';
import type * as channels from '@protocol/channels';
import type { Page } from '../page';
import type * as loopTypes from '@lowire/loop';
import type * as actions from './actions';

/**
 * Performs `options.task` on the page.
 *
 * Cached actions are replayed when available (see `cachedPerform`); otherwise the agent loop
 * is run and the actions recorded on the context are written to the cache file.
 */
export async function pagePerform(progress: Progress, page: Page, options: channels.PagePerformParams): Promise<void> {
  const context = new Context(progress, page);

  if (await cachedPerform(context, options))
    return;

  const resultSchema = zodToJsonSchema(z.object({
    error: z.string().optional().describe('An error message if the task could not be completed successfully'),
  })) as loopTypes.Schema;
  await perform(context, options.task, resultSchema, options);
  await updateCache(context, options);
}

/**
 * Runs the agent loop with an extraction-only prompt built from `options.query` and returns
 * whatever the loop produces for `options.schema`.
 */
export async function pageExtract(progress: Progress, page: Page, options: channels.PageExtractParams) {
  const context = new Context(progress, page);
  const task = `
### Instructions
Extract the following information from the page. Do not perform any actions, just extract the information.

### Query
${options.query}`;
  return await perform(context, task, options.schema, options);
}

/**
 * Runs the agent loop for `userTask`, seeded with a snapshot of the current page.
 *
 * @param options Only `maxTurns` is forwarded to the loop. Callers pass their full protocol
 *   params object here, so other fields (task, key, query, schema) must not leak into the
 *   loop options.
 * @throws Error when the browser context has no `agent` options.
 */
async function perform(context: Context, userTask: string, resultSchema: loopTypes.Schema, options: { maxTurns?: number } = {}): Promise<unknown> {
  const { progress, page } = context;
  const browserContext = page.browserContext;
  const agentSettings = browserContext._options.agent;
  if (!agentSettings)
    throw new Error(`page.perform() and page.extract() require the agent to be set on the browser context`);

  const { full } = await page.snapshotForAI(progress);
  const { tools, callTool } = toolsForLoop(context);

  const loop = new Loop(agentSettings.provider as any, {
    model: agentSettings.model,
    summarize: true,
    debug,
    callTool,
    tools,
    // Only add the key when a value was supplied, exactly like the previous object spread did.
    ...(options.maxTurns !== undefined ? { maxTurns: options.maxTurns } : {}),
  });

  const task = `${userTask}

### Page snapshot
${full}
`;

  return await loop.run(task, {
    resultSchema
  });
}

// Cache file contents: cache key -> recorded actions.
type CachedActions = Record<string, actions.Action[]>;

// In-memory copies of the cache files read so far, keyed by file path.
const allCaches = new Map<string, CachedActions>();

/**
 * Replays cached actions for the task, if any.
 *
 * @returns `true` when cached actions were found and replayed; `false` when no cache file is
 *   configured, the cache mode is 'ignore', or there is no entry for the key.
 * @throws Error when there is no entry for the key and the cache mode is 'force'.
 */
async function cachedPerform(context: Context, options: channels.PagePerformParams): Promise<boolean> {
  const agentSettings = context.page.browserContext._options.agent;
  if (!agentSettings?.cacheFile || agentSettings.cacheMode === 'ignore')
    return false;

  const cache = await cachedActions(agentSettings.cacheFile);
  const cacheKey = options.key ?? options.task;
  const recorded = cache[cacheKey];
  if (!recorded) {
    if (agentSettings.cacheMode === 'force')
      throw new Error(`No cached actions for key "${cacheKey}", but cache mode is set to "force"`);
    return false;
  }

  for (const action of recorded)
    await runAction(context.progress, context.page, action);
  return true;
}

/**
 * Stores the actions recorded on `context` under the task's cache key and rewrites the cache
 * file. Does nothing when no cache file is configured.
 */
async function updateCache(context: Context, options: channels.PagePerformParams): Promise<void> {
  const cacheFile = context.page.browserContext._options.agent?.cacheFile;
  if (!cacheFile)
    return;
  const cache = await cachedActions(cacheFile);
  const cacheKey = options.key ?? options.task;
  cache[cacheKey] = context.actions;
  await fs.promises.writeFile(cacheFile, JSON.stringify(cache, undefined, 2));
}

/**
 * Returns the in-memory cache for `cacheFile`, reading the file on first use.
 * A file that cannot be read, or that is empty, yields an empty cache.
 *
 * @throws Error naming the file when its contents are not valid JSON.
 */
async function cachedActions(cacheFile: string): Promise<CachedActions> {
  const known = allCaches.get(cacheFile);
  if (known)
    return known;

  const text = await fs.promises.readFile(cacheFile, 'utf-8').catch(() => '{}');
  let cache: CachedActions;
  try {
    // An empty file is treated as an empty cache rather than a parse failure.
    cache = text.trim() ? JSON.parse(text) as CachedActions : {};
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to parse agent cache file "${cacheFile}": ${reason}`);
  }
  allCaches.set(cacheFile, cache);
  return cache;
}
import toolDefinitions from './tools';
import { ProgressController } from '../progress';
import { zodToJsonSchema } from '../../mcpBundle';

import type * as loopTypes from '@lowire/loop';
import type { Context } from './context';

/**
 * Adapts the agent tool definitions to the shape expected by the loop.
 *
 * @param context Agent context handed to every tool handler.
 * @returns `tools` — name, description and JSON-schema input of every tool definition;
 *   `callTool` — callback that dispatches a call by tool name. An unknown tool name or a
 *   handler failure is reported as an `isError` result instead of being thrown.
 */
export function toolsForLoop(context: Context): { tools: loopTypes.Tool[], callTool: loopTypes.ToolCallback } {
  const tools = toolDefinitions.map((tool): loopTypes.Tool => ({
    name: tool.schema.name,
    description: tool.schema.description,
    inputSchema: zodToJsonSchema(tool.schema.inputSchema) as loopTypes.Schema,
  }));

  const callTool: loopTypes.ToolCallback = async params => {
    const tool = toolDefinitions.find(t => t.schema.name === params.name);
    if (!tool) {
      return {
        content: [{ type: 'text',
          text: `Tool ${params.name} not found. Available tools: ${toolDefinitions.map(t => t.schema.name)}`
        }],
        isError: true,
      };
    }

    const progressController = new ProgressController();
    try {
      return await progressController.run(async () => {
        return await tool.handle(context, params.arguments);
      });
    } catch (error) {
      // The thrown value is not guaranteed to be an Error instance.
      const text = error instanceof Error ? error.message : String(error);
      return {
        content: [{ type: 'text', text }],
        isError: true,
      };
    }
  };

  return {
    tools,
    callTool,
  };
}
import { BrowserContext } from '../browserContext';
import { runAction } from './actionRunner';

import type { Request } from '../network';
import type * as loopTypes from '@lowire/loop';
import type * as actions from './actions';
import type { Page } from '../page';
import type { Progress } from '../progress';

// Result type produced by tool handlers, derived from the loop's tool callback signature.
type ToolResult = Awaited<ReturnType<loopTypes.ToolCallback>>;

// Upper bound, shared by all pending responses, when waiting after an action.
// NOTE(review): the previous local was named `fiveSeconds` but waited 1000 ms; the 1000 ms
// value is preserved here — confirm the intended budget.
const responseTimeoutMs = 1000;

/**
 * Per-call agent state: the progress and page being driven, plus the list of actions that
 * were executed successfully through `runAction`.
 */
export class Context {
  readonly progress: Progress;
  readonly page: Page;
  readonly actions: actions.Action[] = [];

  constructor(progress: Progress, page: Page) {
    this.progress = progress;
    this.page = page;
  }

  /**
   * Runs `action`, waits for the page to settle, records the action and returns a fresh
   * page snapshot result. The action is only recorded when it did not throw.
   */
  async runAction(action: actions.Action) {
    await this.waitForCompletion(() => runAction(this.progress, this.page, action));
    this.actions.push(action);
    return await this.snapshotResult();
  }

  /**
   * Runs `callback` and then waits for the activity it triggered.
   *
   * Requests emitted on the browser context while the callback runs (and for 500 ms after)
   * are collected. If any of them is a navigation request, the page's action pre-checks are
   * awaited; otherwise the response of every non-failed request is awaited, bounded by a
   * single shared timeout.
   *
   * @returns The value the callback resolved with.
   */
  async waitForCompletion<R>(callback: () => Promise<R>): Promise<R> {
    const requests: Request[] = [];
    const requestListener = (request: Request) => requests.push(request);
    // The listener must be registered before the callback runs, otherwise nothing is collected.
    this.page.browserContext.on(BrowserContext.Events.Request, requestListener);

    let result: R;
    try {
      result = await callback();
      await this.progress.wait(500);
    } finally {
      this.page.browserContext.off(BrowserContext.Events.Request, requestListener);
    }

    if (requests.some(request => request.isNavigationRequest())) {
      await this.page.performActionPreChecks(this.progress);
      return result;
    }

    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeout = new Promise<void>(resolve => {
      timer = setTimeout(resolve, responseTimeoutMs);
    });
    try {
      for (const request of requests) {
        if (request.failure())
          continue;
        await this.progress.race(Promise.race([request.response(), timeout]));
      }
    } finally {
      // Do not leave the timer pending once the wait is over.
      clearTimeout(timer);
    }
    return result;
  }

  /** Takes an AI snapshot of the page and wraps it into a tool result. */
  async snapshotResult(): Promise<ToolResult> {
    const { full } = await this.page.snapshotForAI(this.progress);
    const text = [`# Page snapshot\n${full}`];

    return {
      _meta: {
        'dev.lowire/state': {
          'Page snapshot': full
        },
      },
      content: [{ type: 'text', text: text.join('\n\n') }],
    };
  }

  /**
   * Resolves snapshot element refs to selectors via `aria-ref=<ref>` on the main frame.
   *
   * @returns One resolved selector per entry of `params`, in the same order.
   * @throws Error when a ref cannot be resolved.
   */
  async refSelectors(params: { element: string, ref: string }[]): Promise<string[]> {
    return Promise.all(params.map(async param => {
      try {
        const { resolvedSelector } = await this.page.mainFrame().resolveSelector(this.progress, `aria-ref=${param.ref}`);
        return resolvedSelector;
      } catch (e) {
        throw new Error(`Ref ${param.ref} not found in the current page snapshot. Try capturing new snapshot.`);
      }
    }));
  }
}
import { z } from '../../mcpBundle';

import type zod from 'zod';
import type * as loopTypes from '@lowire/loop';
import type { Context } from './context';

// NOTE(review): the type parameters in this section were missing in this view of the file and
// are restored from how the types are used below — confirm against the original declarations.

// Result type produced by tool handlers, derived from the loop's tool callback signature.
type ToolResult = Awaited<ReturnType<loopTypes.ToolCallback>>;

// Loop tool description whose input schema is kept as a zod schema, plus a display title.
type ToolSchema<Input extends zod.Schema> = Omit<loopTypes.Tool, 'inputSchema'> & {
  title: string;
  inputSchema: Input;
};

// A tool: its schema and the handler that receives params shaped by that schema.
type ToolDefinition<Input extends zod.Schema = zod.Schema> = {
  schema: ToolSchema<Input>;
  handle: (context: Context, params: zod.output<Input>) => Promise<ToolResult>;
};

// Identity helper: lets the compiler infer `Input` so that `handle` gets typed params.
function defineTool<Input extends zod.Schema>(tool: ToolDefinition<Input>): ToolDefinition<Input> {
  return tool;
}

// Returns a snapshot of the current page without performing any action.
const snapshot = defineTool({
  schema: {
    name: 'browser_snapshot',
    title: 'Page snapshot',
    description: 'Capture accessibility snapshot of the current page, this is better than screenshot',
    inputSchema: z.object({}),
  },

  handle: async context => {
    return await context.snapshotResult();
  },
});

// Fields shared by every tool that targets a single element from the page snapshot.
const elementSchema = z.object({
  element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'),
  ref: z.string().describe('Exact target element reference from the page snapshot'),
});

const clickSchema = elementSchema.extend({
  doubleClick: z.boolean().optional().describe('Whether to perform a double click instead of a single click'),
  button: z.enum(['left', 'right', 'middle']).optional().describe('Button to click, defaults to left'),
  modifiers: z.array(z.enum(['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift'])).optional().describe('Modifier keys to press'),
});

// Clicks the referenced element; `doubleClick` is expressed as a click count of 2.
const click = defineTool({
  schema: {
    name: 'browser_click',
    title: 'Click',
    description: 'Perform click on a web page',
    inputSchema: clickSchema,
  },

  handle: async (context, params) => {
    const [selector] = await context.refSelectors([params]);
    return await context.runAction({
      method: 'click',
      selector,
      options: {
        button: params.button,
        modifiers: params.modifiers,
        clickCount: params.doubleClick ? 2 : undefined,
      }
    });
  },
});

// Drags the start element onto the end element.
const drag = defineTool({
  schema: {
    name: 'browser_drag',
    title: 'Drag mouse',
    description: 'Perform drag and drop between two elements',
    inputSchema: z.object({
      startElement: z.string().describe('Human-readable source element description used to obtain the permission to interact with the element'),
      startRef: z.string().describe('Exact source element reference from the page snapshot'),
      endElement: z.string().describe('Human-readable target element description used to obtain the permission to interact with the element'),
      endRef: z.string().describe('Exact target element reference from the page snapshot'),
    }),
  },

  handle: async (context, params) => {
    const [sourceSelector, targetSelector] = await context.refSelectors([
      { ref: params.startRef, element: params.startElement },
      { ref: params.endRef, element: params.endElement },
    ]);

    return await context.runAction({
      method: 'drag',
      sourceSelector,
      targetSelector
    });
  },
});

const hoverSchema = elementSchema.extend({
  modifiers: z.array(z.enum(['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift'])).optional().describe('Modifier keys to press'),
});

// Hovers over the referenced element.
const hover = defineTool({
  schema: {
    name: 'browser_hover',
    title: 'Hover mouse',
    description: 'Hover over element on page',
    inputSchema: hoverSchema,
  },

  handle: async (context, params) => {
    const [selector] = await context.refSelectors([params]);
    return await context.runAction({
      method: 'hover',
      selector,
      options: {
        modifiers: params.modifiers,
      }
    });
  },
});

const selectOptionSchema = elementSchema.extend({
  values: z.array(z.string()).describe('Array of values to select in the dropdown. This can be a single value or multiple values.'),
});

// Selects the given values in the referenced element.
const selectOption = defineTool({
  schema: {
    name: 'browser_select_option',
    title: 'Select option',
    description: 'Select an option in a dropdown',
    inputSchema: selectOptionSchema,
  },

  handle: async (context, params) => {
    const [selector] = await context.refSelectors([params]);
    return await context.runAction({
      method: 'selectOption',
      selector,
      values: params.values
    });
  },
});

// Presses a single key; no element is targeted.
const pressKey = defineTool({
  schema: {
    name: 'browser_press_key',
    title: 'Press a key',
    description: 'Press a key on the keyboard',
    inputSchema: z.object({
      key: z.string().describe('Name of the key to press or a character to generate, such as `ArrowLeft` or `a`'),
    }),
  },

  handle: async (context, params) => {
    return await context.runAction({
      method: 'pressKey',
      key: params.key
    });
  },
});

const typeSchema = elementSchema.extend({
  text: z.string().describe('Text to type into the element'),
  submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
  slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'),
});

// Enters text into the referenced element: `slowly` maps to a 'pressSequentially' action,
// otherwise a 'fill' action is used.
const type = defineTool({
  schema: {
    name: 'browser_type',
    title: 'Type text',
    description: 'Type text into editable element',
    inputSchema: typeSchema,
  },

  handle: async (context, params) => {
    const [selector] = await context.refSelectors([params]);
    if (params.slowly) {
      return await context.runAction({
        method: 'pressSequentially',
        selector,
        text: params.text,
        submit: params.submit,
      });
    }
    return await context.runAction({
      method: 'fill',
      selector,
      text: params.text,
      submit: params.submit,
    });
  },
});

export default [
  snapshot,
  click,
  drag,
  hover,
  selectOption,
  pressKey,
  type,
];
a/packages/playwright-core/src/server/dispatchers/pageDispatcher.ts b/packages/playwright-core/src/server/dispatchers/pageDispatcher.ts index eb64dbe9579e4..836d6bfffe91e 100644 --- a/packages/playwright-core/src/server/dispatchers/pageDispatcher.ts +++ b/packages/playwright-core/src/server/dispatchers/pageDispatcher.ts @@ -27,6 +27,7 @@ import { RouteDispatcher, WebSocketDispatcher } from './networkDispatchers'; import { WebSocketRouteDispatcher } from './webSocketRouteDispatcher'; import { SdkObject } from '../instrumentation'; import { urlMatches } from '../../utils/isomorphic/urlMatch'; +import { pagePerform, pageExtract } from '../agent/agent'; import type { Artifact } from '../artifact'; import type { BrowserContext } from '../browserContext'; @@ -320,6 +321,14 @@ export class PageDispatcher extends Dispatcher { + await pagePerform(progress, this._page, params); + } + + async extract(params: channels.PageExtractParams, progress: Progress): Promise { + return { result: await pageExtract(progress, this._page, params) }; + } + async requests(params: channels.PageRequestsParams, progress: Progress): Promise { // Send all future requests to the client, so that it can reliably receive all of them. 
// Otherwise, if subscription is added in a different task from this call (either before or after), diff --git a/packages/playwright-core/src/server/page.ts b/packages/playwright-core/src/server/page.ts index 7ee90d3569dfd..38b73a1f821e6 100644 --- a/packages/playwright-core/src/server/page.ts +++ b/packages/playwright-core/src/server/page.ts @@ -856,7 +856,7 @@ export class Page extends SdkObject { await Promise.all(this.frames().map(frame => frame.hideHighlight().catch(() => {}))); } - async snapshotForAI(progress: Progress, options: { track?: string }): Promise<{ full: string, incremental?: string }> { + async snapshotForAI(progress: Progress, options: { track?: string } = {}): Promise<{ full: string, incremental?: string }> { const snapshot = await snapshotFrameForAI(progress, this.mainFrame(), options); return { full: snapshot.full.join('\n'), incremental: snapshot.incremental?.join('\n') }; } diff --git a/packages/playwright-core/src/server/utils/DEPS.list b/packages/playwright-core/src/server/utils/DEPS.list index 8d1b7a30f27b9..d454f3cbb8e99 100644 --- a/packages/playwright-core/src/server/utils/DEPS.list +++ b/packages/playwright-core/src/server/utils/DEPS.list @@ -1,4 +1,5 @@ [*] +../../mcpBundle.ts ../../utils ../../utils/isomorphic ../../utilsBundle.ts diff --git a/packages/playwright-core/src/server/utils/nodePlatform.ts b/packages/playwright-core/src/server/utils/nodePlatform.ts index 84eb0e4015f8a..942eb0724a2c4 100644 --- a/packages/playwright-core/src/server/utils/nodePlatform.ts +++ b/packages/playwright-core/src/server/utils/nodePlatform.ts @@ -25,6 +25,7 @@ import { colors } from '../../utilsBundle'; import { debugLogger } from './debugLogger'; import { currentZone, emptyZone } from './zones'; import { debugMode, isUnderTest } from './debug'; +import { zodToJsonSchema } from '../../mcpBundle'; import type { Platform, Zone } from '../../client/platform'; import type { Zone as ZoneImpl } from './zones'; @@ -123,6 +124,8 @@ export const 
nodePlatform: Platform = { return new WritableStreamImpl(channel); }, + zodToJsonSchema, + zones: { current: () => new NodeZone(currentZone()), empty: new NodeZone(emptyZone), diff --git a/packages/playwright-core/src/utils/isomorphic/protocolMetainfo.ts b/packages/playwright-core/src/utils/isomorphic/protocolMetainfo.ts index a190a367d3867..27002daec4b21 100644 --- a/packages/playwright-core/src/utils/isomorphic/protocolMetainfo.ts +++ b/packages/playwright-core/src/utils/isomorphic/protocolMetainfo.ts @@ -138,6 +138,8 @@ export const methodMetainfo = new Map { - page.perform = pagePerform.bind(null, page); - page.extract = pageExtract.bind(null, page); - } - }); -} - -async function pagePerform(page: Page, userTask: string, options: { maxTurns?: number } = {}): Promise { - const resultSchema = { - type: 'object', - properties: { - code: { type: 'string' }, - }, - required: ['code'] - }; - await perform(page, userTask, resultSchema, options); -} - -async function pageExtract(page: Page, query: string, schema: ZodSchema, options: { maxTurns?: number } = {}) { - const task = ` -### Instructions -Extract the following information from the page. Do not perform any actions, just extract the information. 
- -### Query -${query}`; - return await perform(page, task, zodToJsonSchema(schema), options); -} - -async function perform(page: Page, userTask: string, resultSchema: any, options: { maxTurns?: number } = {}): Promise { - const context = page.context(); - if (!context._options.agent) - throw new Error(`page.perform() and page.extract() require the agent to be set on the browser context`); - - const { full } = await page._snapshotForAI(); - const backend = new BrowserServerBackend(defaultConfig, identityBrowserContextFactory(context)); - const client = await wrapInClient(backend, { name: 'Internal', version: '0.0.0' }); - const callTool: (params: { name: string, arguments: any}) => Promise = async params => { - return await client.callTool(params) as lowireLoop.ToolResult; - }; - - const loop = new Loop(context._options.agent.provider as any, { - model: context._options.agent.model, - summarize: true, - debug, - callTool, - tools: await backend.listTools(), - ...options - }); - - const task = `${userTask} - -### Page snapshot -${full} -`; - - try { - return await loop.run(task, { - resultSchema - }); - } finally { - await client.close(); - } -} diff --git a/packages/protocol/src/channels.d.ts b/packages/protocol/src/channels.d.ts index c4e6e8326118b..e7244adb0953d 100644 --- a/packages/protocol/src/channels.d.ts +++ b/packages/protocol/src/channels.d.ts @@ -1011,7 +1011,7 @@ export type BrowserTypeLaunchPersistentContextParams = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 'ignore' | 'force' | 'auto', }, userDataDir: string, @@ -1099,7 +1099,7 @@ export type BrowserTypeLaunchPersistentContextOptions = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 'ignore' | 'force' | 'auto', }, slowMo?: number, @@ -1226,7 +1226,7 @@ export type BrowserNewContextParams = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 
'ignore' | 'force' | 'auto', }, proxy?: { @@ -1300,7 +1300,7 @@ export type BrowserNewContextOptions = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 'ignore' | 'force' | 'auto', }, proxy?: { @@ -1377,7 +1377,7 @@ export type BrowserNewContextForReuseParams = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 'ignore' | 'force' | 'auto', }, proxy?: { @@ -1451,7 +1451,7 @@ export type BrowserNewContextForReuseOptions = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 'ignore' | 'force' | 'auto', }, proxy?: { @@ -1592,7 +1592,7 @@ export type BrowserContextInitializer = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 'ignore' | 'force' | 'auto', }, }, @@ -2111,6 +2111,8 @@ export interface PageChannel extends PageEventTarget, EventTargetChannel { stopCSSCoverage(params?: PageStopCSSCoverageParams, progress?: Progress): Promise; bringToFront(params?: PageBringToFrontParams, progress?: Progress): Promise; updateSubscription(params: PageUpdateSubscriptionParams, progress?: Progress): Promise; + perform(params: PagePerformParams, progress?: Progress): Promise; + extract(params: PageExtractParams, progress?: Progress): Promise; } export type PageBindingCallEvent = { binding: BindingCallChannel, @@ -2614,6 +2616,27 @@ export type PageUpdateSubscriptionOptions = { }; export type PageUpdateSubscriptionResult = void; +export type PagePerformParams = { + task: string, + key?: string, + maxTurns?: number, +}; +export type PagePerformOptions = { + key?: string, + maxTurns?: number, +}; +export type PagePerformResult = void; +export type PageExtractParams = { + query: string, + schema: any, + maxTurns?: number, +}; +export type PageExtractOptions = { + maxTurns?: number, +}; +export type PageExtractResult = { + result: any, +}; export interface PageEvents { 'bindingCall': 
PageBindingCallEvent; @@ -4881,7 +4904,7 @@ export type AndroidDeviceLaunchBrowserParams = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 'ignore' | 'force' | 'auto', }, pkg?: string, @@ -4953,7 +4976,7 @@ export type AndroidDeviceLaunchBrowserOptions = { agent?: { provider: string, model: string, - cacheDir?: string, + cacheFile?: string, cacheMode?: 'ignore' | 'force' | 'auto', }, pkg?: string, diff --git a/packages/protocol/src/protocol.yml b/packages/protocol/src/protocol.yml index d9540743a7060..f36d34abbfa8c 100644 --- a/packages/protocol/src/protocol.yml +++ b/packages/protocol/src/protocol.yml @@ -594,7 +594,7 @@ ContextOptions: properties: provider: string model: string - cacheDir: string? + cacheFile: string? cacheMode: type: enum? literals: @@ -2021,6 +2021,22 @@ Page: - requestFailed enabled: boolean + perform: + internal: true + parameters: + task: string + key: string? + maxTurns: int? + + extract: + internal: true + parameters: + query: string + schema: json + maxTurns: int? + returns: + result: json + events: bindingCall: diff --git a/tests/library/playwright.config.ts b/tests/library/playwright.config.ts index aa3cfb19a0952..1a958c93c5f90 100644 --- a/tests/library/playwright.config.ts +++ b/tests/library/playwright.config.ts @@ -128,10 +128,6 @@ for (const browserName of browserNames) { executablePath, }, trace: trace ? 
'on' : undefined, - agent: { - provider: 'github', - model: 'claude-sonnet-4.5' - } }, metadata: { platform: process.platform, @@ -146,17 +142,26 @@ for (const browserName of browserNames) { } }; - config.projects.push({ + const libraryProject = { name: `${browserName}-library`, testDir: path.join(testDir, 'library'), ...projectTemplate, - }); + }; + config.projects.push(libraryProject); - config.projects.push({ + const pageProject = { name: `${browserName}-page`, testDir: path.join(testDir, 'page'), ...projectTemplate, - }); + }; + pageProject.use.agent = { + provider: 'github', + model: 'claude-sonnet-4.5', + cacheFile: path.join(testDir, 'page', 'agent-cache.json'), + cacheMode: process.env.CI ? 'force' : 'auto', + }; + + config.projects.push(pageProject); } export default config; diff --git a/tests/page/agent-cache.json b/tests/page/agent-cache.json new file mode 100644 index 0000000000000..b3ac1e1e08e9d --- /dev/null +++ b/tests/page/agent-cache.json @@ -0,0 +1,66 @@ +{ + "Fill out the form with the following details:\nName: John Doe\nAddress: 123 Main St, Anytown, XYZ state\nZip Code: 12345\nEmail: john@doe.me": [ + { + "method": "fill", + "selector": "internal:role=textbox[name=\"Full Name *\"i]", + "text": "John Doe" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"Email Address *\"i]", + "text": "john@doe.me" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"Street Address *\"i]", + "text": "123 Main St" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"City *\"i]", + "text": "Anytown" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"State/Province *\"i]", + "text": "XYZ" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"ZIP/Postal Code *\"i]", + "text": "12345" + } + ], + "Fill out the form with the following details:\nName: John Smith\nAddress: 1045 La Avenida St, Mountain View, CA 94043\nEmail: john.smith@at-microsoft.com": [ + { + 
"method": "fill", + "selector": "internal:role=textbox[name=\"Full Name *\"i]", + "text": "John Smith" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"Email Address *\"i]", + "text": "john.smith@at-microsoft.com" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"Street Address *\"i]", + "text": "1045 La Avenida St" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"City *\"i]", + "text": "Mountain View" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"State/Province *\"i]", + "text": "CA" + }, + { + "method": "fill", + "selector": "internal:role=textbox[name=\"ZIP/Postal Code *\"i]", + "text": "94043" + } + ] +} \ No newline at end of file diff --git a/tests/page/perform-task.spec.ts b/tests/page/perform-task.spec.ts index 908477d57b026..a28df7d411c58 100644 --- a/tests/page/perform-task.spec.ts +++ b/tests/page/perform-task.spec.ts @@ -14,10 +14,26 @@ * limitations under the License. */ -import { test } from './pageTest'; +import { test, expect } from './pageTest'; import z from 'zod'; -test.skip('perform task', async ({ page }) => { +test('page.perform', async ({ page, server }) => { + await page.goto(server.PREFIX + '/evals/fill-form.html'); + await page.perform('Fill out the form with the following details:\n' + + 'Name: John Smith\n' + + 'Address: 1045 La Avenida St, Mountain View, CA 94043\n' + + 'Email: john.smith@at-microsoft.com'); + await expect(page.locator('body')).toMatchAriaSnapshot(` + - textbox "Full Name *": John Smith + - textbox "Email Address *": john.smith@at-microsoft.com + - textbox "Street Address *": 1045 La Avenida St + - textbox "City *": Mountain View + - textbox "State/Province *": CA + - textbox "ZIP/Postal Code *": 94043 + `); +}); + +test.skip('extract task', async ({ page }) => { await page.goto('https://demo.playwright.dev/todomvc'); await page.perform('Add "Buy groceries" todo'); console.log(await page.extract('List todos with their 
statuses', z.object({