diff --git a/src/core/prompts/tools/native-tools/__tests__/read_file.spec.ts b/src/core/prompts/tools/native-tools/__tests__/read_file.spec.ts index 92c2c9f5db6..9561fe417d0 100644 --- a/src/core/prompts/tools/native-tools/__tests__/read_file.spec.ts +++ b/src/core/prompts/tools/native-tools/__tests__/read_file.spec.ts @@ -96,6 +96,48 @@ describe("createReadFileTool", () => { }) }) + describe("supportsImages option", () => { + it("should include image format documentation when supportsImages is true", () => { + const tool = createReadFileTool({ supportsImages: true }) + const description = getFunctionDef(tool).description + + expect(description).toContain( + "Automatically processes and returns image files (PNG, JPG, JPEG, GIF, BMP, SVG, WEBP, ICO, AVIF) for visual analysis", + ) + }) + + it("should not include image format documentation when supportsImages is false", () => { + const tool = createReadFileTool({ supportsImages: false }) + const description = getFunctionDef(tool).description + + expect(description).not.toContain( + "Automatically processes and returns image files (PNG, JPG, JPEG, GIF, BMP, SVG, WEBP, ICO, AVIF) for visual analysis", + ) + expect(description).toContain("may not handle other binary files properly") + }) + + it("should default supportsImages to false", () => { + const tool = createReadFileTool({}) + const description = getFunctionDef(tool).description + + expect(description).not.toContain( + "Automatically processes and returns image files (PNG, JPG, JPEG, GIF, BMP, SVG, WEBP, ICO, AVIF) for visual analysis", + ) + }) + + it("should always include PDF and DOCX support in description", () => { + const toolWithImages = createReadFileTool({ supportsImages: true }) + const toolWithoutImages = createReadFileTool({ supportsImages: false }) + + expect(getFunctionDef(toolWithImages).description).toContain( + "Supports text extraction from PDF and DOCX files", + ) + expect(getFunctionDef(toolWithoutImages).description).toContain( + "Supports text extraction from PDF and DOCX files", + ) + }) + }) + describe("combined options", () => { it("should correctly combine low maxConcurrentFileReads with partialReadsEnabled", () => { const tool = createReadFileTool({ @@ -120,6 +162,49 @@ describe("createReadFileTool", () => { expect(description).not.toContain("line_ranges") expect(description).not.toContain("Example multiple files") }) + + it("should correctly combine partialReadsEnabled and supportsImages", () => { + const tool = createReadFileTool({ + partialReadsEnabled: true, + supportsImages: true, + }) + const description = getFunctionDef(tool).description + + // Should have both line_ranges and image support + expect(description).toContain("line_ranges") + expect(description).toContain( + "Automatically processes and returns image files (PNG, JPG, JPEG, GIF, BMP, SVG, WEBP, ICO, AVIF) for visual analysis", + ) + }) + + it("should work with partialReadsEnabled=false and supportsImages=true", () => { + const tool = createReadFileTool({ + partialReadsEnabled: false, + supportsImages: true, + }) + const description = getFunctionDef(tool).description + + // Should have image support but no line_ranges + expect(description).not.toContain("line_ranges") + expect(description).toContain( + "Automatically processes and returns image files (PNG, JPG, JPEG, GIF, BMP, SVG, WEBP, ICO, AVIF) for visual analysis", + ) + }) + + it("should correctly combine all three options", () => { + const tool = createReadFileTool({ + maxConcurrentFileReads: 3, + partialReadsEnabled: true, + supportsImages: true, + }) + const description = getFunctionDef(tool).description + + expect(description).toContain("maximum of 3 files") + expect(description).toContain("line_ranges") + expect(description).toContain( + "Automatically processes and returns image files (PNG, JPG, JPEG, GIF, BMP, SVG, WEBP, ICO, AVIF) for visual analysis", + ) + }) }) describe("tool structure", () => { diff --git a/src/core/prompts/tools/native-tools/index.ts b/src/core/prompts/tools/native-tools/index.ts index aaa89087336..4f78729cdc8 100644 --- a/src/core/prompts/tools/native-tools/index.ts +++ b/src/core/prompts/tools/native-tools/index.ts @@ -33,6 +33,8 @@ export interface NativeToolsOptions { partialReadsEnabled?: boolean /** Maximum number of files that can be read in a single read_file request (default: 5) */ maxConcurrentFileReads?: number + /** Whether the model supports image processing (default: false) */ + supportsImages?: boolean } /** @@ -42,11 +44,12 @@ export interface NativeToolsOptions { * @returns Array of native tool definitions */ export function getNativeTools(options: NativeToolsOptions = {}): OpenAI.Chat.ChatCompletionTool[] { - const { partialReadsEnabled = true, maxConcurrentFileReads = 5 } = options + const { partialReadsEnabled = true, maxConcurrentFileReads = 5, supportsImages = false } = options const readFileOptions: ReadFileToolOptions = { partialReadsEnabled, maxConcurrentFileReads, + supportsImages, } return [ diff --git a/src/core/prompts/tools/native-tools/read_file.ts b/src/core/prompts/tools/native-tools/read_file.ts index cfb0b8bbe11..7171be0f1d6 100644 --- a/src/core/prompts/tools/native-tools/read_file.ts +++ b/src/core/prompts/tools/native-tools/read_file.ts @@ -1,6 +1,17 @@ import type OpenAI from "openai" -const READ_FILE_SUPPORTS_NOTE = `Supports text extraction from PDF and DOCX files, but may not handle other binary files properly.` +/** + * Generates the file support note, optionally including image format support. + * + * @param supportsImages - Whether the model supports image processing + * @returns Support note string + */ +function getReadFileSupportsNote(supportsImages: boolean): string { + if (supportsImages) { + return `Supports text extraction from PDF and DOCX files. Automatically processes and returns image files (PNG, JPG, JPEG, GIF, BMP, SVG, WEBP, ICO, AVIF) for visual analysis. May not handle other binary files properly.` + } + return `Supports text extraction from PDF and DOCX files, but may not handle other binary files properly.` +} /** * Options for creating the read_file tool definition. @@ -10,6 +21,8 @@ export interface ReadFileToolOptions { partialReadsEnabled?: boolean /** Maximum number of files that can be read in a single request (default: 5) */ maxConcurrentFileReads?: number + /** Whether the model supports image processing (default: false) */ + supportsImages?: boolean } /** @@ -20,7 +33,7 @@ export interface ReadFileToolOptions { * @returns Native tool definition for read_file */ export function createReadFileTool(options: ReadFileToolOptions = {}): OpenAI.Chat.ChatCompletionTool { - const { partialReadsEnabled = true, maxConcurrentFileReads = 5 } = options + const { partialReadsEnabled = true, maxConcurrentFileReads = 5, supportsImages = false } = options const isMultipleReadsEnabled = maxConcurrentFileReads > 1 // Build description intro with concurrent reads limit message @@ -50,7 +63,8 @@ export function createReadFileTool(options: ReadFileToolOptions = {}): OpenAI.Ch ? `Example multiple files (within ${maxConcurrentFileReads}-file limit): { files: [{ path: 'file1.ts' }, { path: 'file2.ts' }] }` : "") - const description = baseDescription + optionalRangesDescription + READ_FILE_SUPPORTS_NOTE + " " + examples + const description = + baseDescription + optionalRangesDescription + getReadFileSupportsNote(supportsImages) + " " + examples // Build the properties object conditionally const fileProperties: Record = { diff --git a/src/core/task/build-tools.ts b/src/core/task/build-tools.ts index d3e43472b6c..52a9f2eb82f 100644 --- a/src/core/task/build-tools.ts +++ b/src/core/task/build-tools.ts @@ -64,10 +64,14 @@ export async function buildNativeToolsArray(options: BuildToolsOptions): Promise // Determine if partial reads are enabled based on maxReadFileLine setting. const partialReadsEnabled = maxReadFileLine !== -1 + // Check if the model supports images for read_file tool description. + const supportsImages = modelInfo?.supportsImages ?? false + // Build native tools with dynamic read_file tool based on settings. const nativeTools = getNativeTools({ partialReadsEnabled, maxConcurrentFileReads, + supportsImages, }) // Filter native tools based on mode restrictions.