Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion packages/types/src/tool-params.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ export interface Size {
}

/**
 * Parameters accepted by the browser action tool.
 *
 * Only `action` is mandatory; each of the remaining fields is meaningful for
 * one specific action and is left undefined otherwise.
 */
export interface BrowserActionParams {
	/**
	 * The browser operation to perform.
	 *
	 * NOTE(review): the native tool schema and the tool's result handling also
	 * know a "press" action; confirm whether it should be listed here as well.
	 */
	action: "launch" | "click" | "hover" | "type" | "scroll_down" | "scroll_up" | "resize" | "close" | "screenshot"
	/** URL to open; used by the `launch` action. */
	url?: string
	/** Target position for pointer actions such as `click`. */
	coordinate?: Coordinate
	/** New viewport size; used by the `resize` action. */
	size?: Size
	/** Text to enter; used by the `type` action. */
	text?: string
	/** Destination file (relative to the workspace); used by the `screenshot` action. */
	path?: string
}

export interface GenerateImageParams {
Expand Down
2 changes: 2 additions & 0 deletions src/core/assistant-message/NativeToolCallParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ export class NativeToolCallParser {
coordinate: partialArgs.coordinate,
size: partialArgs.size,
text: partialArgs.text,
path: partialArgs.path,
}
}
break
Expand Down Expand Up @@ -645,6 +646,7 @@ export class NativeToolCallParser {
coordinate: args.coordinate,
size: args.size,
text: args.text,
path: args.path,
} as NativeArgsFor<TName>
}
break
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions src/core/prompts/tools/browser-action.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ Parameters:
- Use with the \`size\` parameter to specify the new size.
* scroll_down: Scroll down the page by one page height.
* scroll_up: Scroll up the page by one page height.
* screenshot: Take a screenshot and save it to a file.
- Use with the \`path\` parameter to specify the destination file path.
- Supported formats: .png, .jpeg, .webp
- Example: \`<action>screenshot</action>\` with \`<path>screenshots/result.png</path>\`
* close: Close the Puppeteer-controlled browser instance. This **must always be the final browser action**.
- Example: \`<action>close</action>\`
- url: (optional) Use this for providing the URL for the \`launch\` action.
Expand All @@ -56,6 +60,9 @@ Parameters:
* Example: <size>1280,720</size>
- text: (optional) Use this for providing the text for the \`type\` action.
* Example: <text>Hello, world!</text>
- path: (optional) File path for the \`screenshot\` action. Path is relative to the workspace.
* Supported formats: .png, .jpeg, .webp
* Example: <path>screenshots/my-screenshot.png</path>
Usage:
<browser_action>
<action>Action to perform (e.g., launch, click, type, press, scroll_down, scroll_up, close)</action>
Expand All @@ -74,5 +81,11 @@ Example: Requesting to click on the element at coordinates 450,300 on a 1024x768
<browser_action>
<action>click</action>
<coordinate>450,300@1024x768</coordinate>
</browser_action>

Example: Taking a screenshot and saving it to a file
<browser_action>
<action>screenshot</action>
<path>screenshots/result.png</path>
</browser_action>`
}
19 changes: 18 additions & 1 deletion src/core/prompts/tools/native-tools/browser_action.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ const SIZE_PARAMETER_DESCRIPTION = `Viewport dimensions for the resize action in

// Description shown to the model for the `text` property of the tool schema below.
const TEXT_PARAMETER_DESCRIPTION = `Text to type when performing the type action, or key name to press when performing the press action (e.g., 'Enter', 'Tab', 'Escape')`

// Description shown to the model for the `path` property of the tool schema below.
// The schema itself only lists `action` as required and lets `path` be null, so the
// "Required for screenshot action" wording is a hint to the model, not a schema rule.
const PATH_PARAMETER_DESCRIPTION = `File path where the screenshot should be saved (relative to workspace). Required for screenshot action. Supports .png, .jpeg, and .webp extensions. Example: 'screenshots/result.png'`

export default {
type: "function",
function: {
Expand All @@ -33,7 +35,18 @@ export default {
action: {
type: "string",
description: ACTION_PARAMETER_DESCRIPTION,
enum: ["launch", "click", "hover", "type", "press", "scroll_down", "scroll_up", "resize", "close"],
enum: [
"launch",
"click",
"hover",
"type",
"press",
"scroll_down",
"scroll_up",
"resize",
"close",
"screenshot",
],
},
url: {
type: ["string", "null"],
Expand All @@ -51,6 +64,10 @@ export default {
type: ["string", "null"],
description: TEXT_PARAMETER_DESCRIPTION,
},
path: {
type: ["string", "null"],
description: PATH_PARAMETER_DESCRIPTION,
},
},
required: ["action"],
additionalProperties: false,
Expand Down
23 changes: 21 additions & 2 deletions src/core/tools/BrowserActionTool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export async function browserActionTool(
const coordinate: string | undefined = block.params.coordinate
const text: string | undefined = block.params.text
const size: string | undefined = block.params.size
const filePath: string | undefined = block.params.path

if (!action || !browserActions.includes(action)) {
// checking for action to ensure it is complete and valid
Expand Down Expand Up @@ -155,6 +156,17 @@ export async function browserActionTool(
}
}

if (action === "screenshot") {
if (!filePath) {
cline.consecutiveMistakeCount++
cline.recordToolError("browser_action")
cline.didToolFailInCurrentTurn = true
pushToolResult(await cline.sayAndCreateMissingParamError("browser_action", "path"))
// Do not close the browser on parameter validation errors
return
}
}

cline.consecutiveMistakeCount = 0

// Prepare say payload; include executedCoordinate for pointer actions
Expand Down Expand Up @@ -191,6 +203,9 @@ export async function browserActionTool(
case "resize":
browserActionResult = await cline.browserSession.resize(size!)
break
case "screenshot":
browserActionResult = await cline.browserSession.saveScreenshot(filePath!, cline.cwd)
break
case "close":
browserActionResult = await cline.browserSession.closeBrowser()
break
Expand All @@ -205,12 +220,16 @@ export async function browserActionTool(
case "press":
case "scroll_down":
case "scroll_up":
case "resize": {
case "resize":
case "screenshot": {
await cline.say("browser_action_result", JSON.stringify(browserActionResult))

const images = browserActionResult?.screenshot ? [browserActionResult.screenshot] : []

let messageText = `The browser action has been executed.`
let messageText =
action === "screenshot"
? `Screenshot saved to: ${filePath}`
: `The browser action has been executed.`

messageText += `\n\n**CRITICAL**: When providing click/hover coordinates:`
messageText += `\n1. Screenshot dimensions != Browser viewport dimensions`
Expand Down
27 changes: 27 additions & 0 deletions src/core/tools/__tests__/BrowserActionTool.screenshot.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Test screenshot action functionality in browser actions
import { describe, it, expect } from "vitest"
import { browserActions } from "../../../shared/ExtensionMessage"

describe("Browser Action Screenshot", () => {
	describe("browserActions array", () => {
		// The complete, ordered list of actions the shared constant is expected to expose.
		// `toEqual` below compares element by element, so order matters.
		const expectedActions = [
			"launch",
			"click",
			"hover",
			"type",
			"press",
			"scroll_down",
			"scroll_up",
			"resize",
			"close",
			"screenshot",
		]

		// Membership check: fails with a focused message if "screenshot" is missing.
		it("should include screenshot action", () => {
			expect(browserActions).toContain("screenshot")
		})

		// Exact-match check: guards against accidental additions, removals or reordering.
		it("should have screenshot as a valid browser action type", () => {
			expect(browserActions).toEqual(expectedActions)
		})
	})
})
49 changes: 49 additions & 0 deletions src/services/browser/BrowserSession.ts
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,55 @@ export class BrowserSession {
})
}

/**
 * Picks the screenshot encoding that matches a destination file's extension.
 *
 * The extension is compared case-insensitively. `.jpg` and `.jpeg` map to
 * "jpeg", `.webp` maps to "webp", and anything else (including no extension)
 * falls back to "png".
 *
 * @param filePath - Destination path whose extension decides the encoding
 * @returns The image type to request for the screenshot
 */
private getImageTypeFromPath(filePath: string): "png" | "jpeg" | "webp" {
	switch (path.extname(filePath).toLowerCase()) {
		case ".jpg":
		case ".jpeg":
			return "jpeg"
		case ".webp":
			return "webp"
		default:
			return "png"
	}
}

/**
 * Takes a screenshot of the current page and writes it to a file inside the workspace.
 *
 * The destination is resolved against `cwd` and validated before any browser work
 * starts, so an invalid path never triggers a page action. The image encoding is
 * chosen from the file extension (unknown extensions are written as PNG). For
 * non-PNG output the quality comes from the `screenshotQuality` global-state entry,
 * defaulting to 75.
 *
 * Containment is a lexical check on the resolved path; symbolic links inside the
 * workspace are not followed or inspected here.
 *
 * @param filePath - The destination file path (relative to workspace)
 * @param cwd - Current working directory for resolving relative paths
 * @returns Whatever `doAction` produces for the capture callback
 *          (NOTE(review): presumably includes the usual page screenshot/logs — confirm against doAction)
 * @throws Error if the resolved path escapes the workspace directory or is the
 *         workspace directory itself
 */
async saveScreenshot(filePath: string, cwd: string): Promise<BrowserActionResult> {
	// Always resolve the path against the workspace root
	const normalizedCwd = path.resolve(cwd)
	const fullPath = path.resolve(cwd, filePath)

	// Compare via the relative path instead of a string prefix: this stays correct
	// when the workspace is a filesystem root (where `root + sep` never matches) and
	// follows the platform's own path comparison rules.
	const relativeToWorkspace = path.relative(normalizedCwd, fullPath)
	const escapesWorkspace =
		relativeToWorkspace === ".." ||
		relativeToWorkspace.startsWith(".." + path.sep) ||
		path.isAbsolute(relativeToWorkspace) // e.g. a different drive on Windows

	if (escapesWorkspace) {
		throw new Error(
			`Screenshot path "${filePath}" resolves to "${fullPath}" which is outside the workspace "${normalizedCwd}". ` +
				`Paths must be relative to the workspace and cannot escape it.`,
		)
	}

	// An empty relative path means the target is the workspace directory itself,
	// which can never be written as an image file; fail early with a clear message.
	if (relativeToWorkspace === "") {
		throw new Error(
			`Screenshot path "${filePath}" resolves to the workspace directory "${normalizedCwd}" itself. ` +
				`Provide a file name inside the workspace, e.g. "screenshots/result.png".`,
		)
	}

	return this.doAction(async (page) => {
		// Ensure the destination directory exists
		await fs.mkdir(path.dirname(fullPath), { recursive: true })

		// Determine image type from extension
		const imageType = this.getImageTypeFromPath(filePath)

		// Quality only applies to lossy formats, so it is left undefined for PNG
		const quality =
			imageType === "png"
				? undefined
				: ((this.context.globalState.get("screenshotQuality") as number | undefined) ?? 75)

		// Let the page write the image straight to the destination file
		await page.screenshot({
			path: fullPath,
			type: imageType,
			quality,
		})
	})
}

/**
* Draws a cursor indicator on the page at the specified position
*/
Expand Down
Loading
Loading