diff --git a/src/core/tools/ReadFileTool.ts b/src/core/tools/ReadFileTool.ts index d21c8cd247a..17282df7744 100644 --- a/src/core/tools/ReadFileTool.ts +++ b/src/core/tools/ReadFileTool.ts @@ -25,7 +25,7 @@ import { processImageFile, ImageMemoryTracker, } from "./helpers/imageHelpers" -import { validateFileTokenBudget, truncateFileContent } from "./helpers/fileTokenBudget" +import { FILE_READ_BUDGET_PERCENT, readFileWithTokenBudget } from "./helpers/fileTokenBudget" import { truncateDefinitionsToLineLimit } from "./helpers/truncateDefinitions" import { BaseTool, ToolCallbacks } from "./BaseTool" import type { ToolUse } from "../../shared/tools" @@ -386,7 +386,38 @@ export class ReadFileTool extends BaseTool<"read_file"> { } if (supportedBinaryFormats && supportedBinaryFormats.includes(fileExtension)) { - // Fall through to extractTextFromFile + // Use extractTextFromFile for supported binary formats (PDF, DOCX, etc.) + try { + const content = await extractTextFromFile(fullPath) + const numberedContent = addLineNumbers(content) + const lines = content.split("\n") + const lineCount = lines.length + const lineRangeAttr = lineCount > 0 ? ` lines="1-${lineCount}"` : "" + + await task.fileContextTracker.trackFileContext(relPath, "read_tool" as RecordSource) + + updateFileResult(relPath, { + xmlContent: + lineCount > 0 + ? `${relPath}\n\n${numberedContent}\n` + : `${relPath}\nFile is empty\n`, + nativeContent: + lineCount > 0 + ? `File: ${relPath}\nLines 1-${lineCount}:\n${numberedContent}` + : `File: ${relPath}\nNote: File is empty`, + }) + continue + } catch (error) { + const errorMsg = error instanceof Error ? 
error.message : String(error) + updateFileResult(relPath, { + status: "error", + error: `Error extracting text: ${errorMsg}`, + xmlContent: `${relPath}Error extracting text: ${errorMsg}`, + nativeContent: `File: ${relPath}\nError: Error extracting text: ${errorMsg}`, + }) + await task.say("error", `Error extracting text from ${relPath}: ${errorMsg}`) + continue + } } else { const fileFormat = fileExtension.slice(1) || "bin" updateFileResult(relPath, { @@ -492,48 +523,54 @@ export class ReadFileTool extends BaseTool<"read_file"> { settings: task.apiConfiguration, }) ?? ANTHROPIC_DEFAULT_MAX_TOKENS - const budgetResult = await validateFileTokenBudget( - fullPath, - contextWindow - maxOutputTokens, - contextTokens || 0, - ) + // Calculate available token budget (60% of remaining context) + const remainingTokens = contextWindow - maxOutputTokens - (contextTokens || 0) + const safeReadBudget = Math.floor(remainingTokens * FILE_READ_BUDGET_PERCENT) - let content = await extractTextFromFile(fullPath) + let content: string let xmlInfo = "" - let nativeInfo = "" - if (budgetResult.shouldTruncate && budgetResult.maxChars !== undefined) { - const truncateResult = truncateFileContent( - content, - budgetResult.maxChars, - content.length, - budgetResult.isPreview, - ) - content = truncateResult.content - - let displayedLines = content.length === 0 ? 0 : content.split(/\r?\n/).length - if (displayedLines > 0 && content.endsWith("\n")) { - displayedLines-- - } - const lineRangeAttr = displayedLines > 0 ? ` lines="1-${displayedLines}"` : "" - xmlInfo = - content.length > 0 ? `\n${content}\n` : `` - xmlInfo += `${truncateResult.notice}\n` - - nativeInfo = - content.length > 0 - ? 
`Lines 1-${displayedLines}:\n${content}\n\nNote: ${truncateResult.notice}` - : `Note: ${truncateResult.notice}` + if (safeReadBudget <= 0) { + // No budget available + content = "" + const notice = "No available context budget for file reading" + xmlInfo = `\n${notice}\n` + nativeInfo = `Note: ${notice}` } else { - const lineRangeAttr = ` lines="1-${totalLines}"` - xmlInfo = totalLines > 0 ? `\n${content}\n` : `` + // Read file with incremental token counting + const result = await readFileWithTokenBudget(fullPath, { + budgetTokens: safeReadBudget, + }) - if (totalLines === 0) { - xmlInfo += `File is empty\n` - nativeInfo = "Note: File is empty" + content = addLineNumbers(result.content) + + if (!result.complete) { + // File was truncated + const notice = `File truncated: showing ${result.lineCount} lines (${result.tokenCount} tokens) due to context budget. Use line_range to read specific sections.` + const lineRangeAttr = result.lineCount > 0 ? ` lines="1-${result.lineCount}"` : "" + xmlInfo = + result.lineCount > 0 + ? `\n${content}\n${notice}\n` + : `\n${notice}\n` + nativeInfo = + result.lineCount > 0 + ? `Lines 1-${result.lineCount}:\n${content}\n\nNote: ${notice}` + : `Note: ${notice}` } else { - nativeInfo = `Lines 1-${totalLines}:\n${content}` + // Full file read + const lineRangeAttr = ` lines="1-${result.lineCount}"` + xmlInfo = + result.lineCount > 0 + ? 
`\n${content}\n` + : `` + + if (result.lineCount === 0) { + xmlInfo += `File is empty\n` + nativeInfo = "Note: File is empty" + } else { + nativeInfo = `Lines 1-${result.lineCount}:\n${content}` + } } } diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts index d109a6d430a..d22c163636f 100644 --- a/src/core/tools/__tests__/readFileTool.spec.ts +++ b/src/core/tools/__tests__/readFileTool.spec.ts @@ -36,20 +36,29 @@ vi.mock("fs/promises", () => fsPromises) // Mock input content for tests let mockInputContent = "" +// Create hoisted mocks that can be used in vi.mock factories +const { addLineNumbersMock, mockReadFileWithTokenBudget } = vi.hoisted(() => { + const addLineNumbersMock = vi.fn().mockImplementation((text: string, startLine = 1) => { + if (!text) return "" + const lines = typeof text === "string" ? text.split("\n") : [text] + return lines.map((line: string, i: number) => `${startLine + i} | ${line}`).join("\n") + }) + const mockReadFileWithTokenBudget = vi.fn() + return { addLineNumbersMock, mockReadFileWithTokenBudget } +}) + // First create all the mocks vi.mock("../../../integrations/misc/extract-text", () => ({ extractTextFromFile: vi.fn(), - addLineNumbers: vi.fn(), + addLineNumbers: addLineNumbersMock, getSupportedBinaryFormats: vi.fn(() => [".pdf", ".docx", ".ipynb"]), })) vi.mock("../../../services/tree-sitter") -// Then create the mock functions -const addLineNumbersMock = vi.fn().mockImplementation((text, startLine = 1) => { - if (!text) return "" - const lines = typeof text === "string" ? 
text.split("\n") : [text] - return lines.map((line, i) => `${startLine + i} | ${line}`).join("\n") -}) +// Mock readFileWithTokenBudget - must be mocked to prevent actual file system access +vi.mock("../../../integrations/misc/read-file-with-budget", () => ({ + readFileWithTokenBudget: (...args: any[]) => mockReadFileWithTokenBudget(...args), +})) const extractTextFromFileMock = vi.fn() const getSupportedBinaryFormatsMock = vi.fn(() => [".pdf", ".docx", ".ipynb"]) @@ -145,6 +154,27 @@ beforeEach(() => { }) : [] }) + + // Reset addLineNumbers mock to its default implementation (prevents cross-test pollution) + addLineNumbersMock.mockReset() + addLineNumbersMock.mockImplementation((text: string, startLine = 1) => { + if (!text) return "" + const lines = typeof text === "string" ? text.split("\n") : [text] + return lines.map((line: string, i: number) => `${startLine + i} | ${line}`).join("\n") + }) + + // Reset readFileWithTokenBudget mock with default implementation + mockReadFileWithTokenBudget.mockClear() + mockReadFileWithTokenBudget.mockImplementation(async (_filePath: string, _options: any) => { + // Default: return the mockInputContent with 5 lines + const lines = mockInputContent ? 
mockInputContent.split("\n") : [] + return { + content: mockInputContent, + tokenCount: mockInputContent.length / 4, // rough estimate + lineCount: lines.length, + complete: true, + } + }) }) // Mock i18n translation function @@ -496,7 +526,16 @@ describe("read_file tool with maxReadFileLine setting", () => { it("should read with extractTextFromFile when file has few lines", async () => { // Setup mockedCountFileLines.mockResolvedValue(3) // File shorter than maxReadFileLine - mockInputContent = fileContent + const threeLineContent = "Line 1\nLine 2\nLine 3" + mockInputContent = threeLineContent + + // Configure the mock to return the correct content for this test + mockReadFileWithTokenBudget.mockResolvedValueOnce({ + content: threeLineContent, + tokenCount: threeLineContent.length / 4, + lineCount: 3, + complete: true, + }) // Execute const result = await executeReadFileTool({}, { maxReadFileLine: 5, totalLines: 3 }) @@ -656,11 +695,15 @@ describe("read_file tool XML output structure", () => { it("should produce XML output with no unnecessary indentation", async () => { // Setup const numberedContent = "1 | Line 1\n2 | Line 2\n3 | Line 3\n4 | Line 4\n5 | Line 5" - // For XML structure test - mockedExtractTextFromFile.mockImplementation(() => { - addLineNumbersMock(mockInputContent) - return Promise.resolve(numberedContent) + + // Configure mockReadFileWithTokenBudget to return the 5-line content + mockReadFileWithTokenBudget.mockResolvedValueOnce({ + content: fileContent, // "Line 1\nLine 2\nLine 3\nLine 4\nLine 5" + tokenCount: fileContent.length / 4, + lineCount: 5, + complete: true, }) + mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1, maxImageFileSize: 20, @@ -693,7 +736,15 @@ describe("read_file tool XML output structure", () => { it("should handle empty files correctly", async () => { // Setup mockedCountFileLines.mockResolvedValue(0) - mockedExtractTextFromFile.mockResolvedValue("") + + // Configure mockReadFileWithTokenBudget to return empty 
content + mockReadFileWithTokenBudget.mockResolvedValueOnce({ + content: "", + tokenCount: 0, + lineCount: 0, + complete: true, + }) + mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1, maxImageFileSize: 20, diff --git a/src/core/tools/helpers/__tests__/fileTokenBudget.spec.ts b/src/core/tools/helpers/__tests__/fileTokenBudget.spec.ts deleted file mode 100644 index 4eea6435a89..00000000000 --- a/src/core/tools/helpers/__tests__/fileTokenBudget.spec.ts +++ /dev/null @@ -1,357 +0,0 @@ -import { describe, it, expect, vi, beforeEach, afterEach } from "vitest" -import { - validateFileTokenBudget, - truncateFileContent, - FILE_SIZE_THRESHOLD, - MAX_FILE_SIZE_FOR_TOKENIZATION, - PREVIEW_SIZE_FOR_LARGE_FILES, -} from "../fileTokenBudget" - -// Mock dependencies -vi.mock("fs/promises", () => ({ - stat: vi.fn(), - readFile: vi.fn(), - open: vi.fn(), -})) - -vi.mock("../../../../utils/countTokens", () => ({ - countTokens: vi.fn(), -})) - -// Import after mocking -const fs = await import("fs/promises") -const { countTokens } = await import("../../../../utils/countTokens") - -const mockStat = vi.mocked(fs.stat) -const mockReadFile = vi.mocked(fs.readFile) -const mockOpen = vi.mocked(fs.open) -const mockCountTokens = vi.mocked(countTokens) - -describe("fileTokenBudget", () => { - beforeEach(() => { - vi.clearAllMocks() - mockOpen.mockReset() - }) - - afterEach(() => { - vi.restoreAllMocks() - }) - - describe("validateFileTokenBudget", () => { - it("should not truncate files smaller than FILE_SIZE_THRESHOLD", async () => { - const filePath = "/test/small-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - - // Mock file stats - small file (50KB) - mockStat.mockResolvedValue({ - size: 50000, - } as any) - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - expect(result.shouldTruncate).toBe(false) - expect(mockReadFile).not.toHaveBeenCalled() - expect(mockCountTokens).not.toHaveBeenCalled() - }) - - it("should 
validate and not truncate large files that fit within budget", async () => { - const filePath = "/test/large-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - const fileContent = "x".repeat(150000) // 150KB file - - // Mock file stats - large file (150KB) - mockStat.mockResolvedValue({ - size: 150000, - } as any) - - // Mock file read - mockReadFile.mockResolvedValue(fileContent) - - // Mock token counting - file uses 30k tokens (within 60% of 190k remaining = 114k budget) - mockCountTokens.mockResolvedValue(30000) - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - expect(result.shouldTruncate).toBe(false) - expect(mockReadFile).toHaveBeenCalledWith(filePath, "utf-8") - expect(mockCountTokens).toHaveBeenCalled() - }) - - it("should truncate large files that exceed token budget", async () => { - const filePath = "/test/huge-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - const fileContent = "x".repeat(500000) // 500KB file - - // Mock file stats - huge file (500KB) - mockStat.mockResolvedValue({ - size: 500000, - } as any) - - // Mock file read - mockReadFile.mockResolvedValue(fileContent) - - // Mock token counting - file uses 150k tokens (exceeds 60% of 190k remaining = 114k budget) - mockCountTokens.mockResolvedValue(150000) - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - expect(result.shouldTruncate).toBe(true) - expect(result.maxChars).toBeDefined() - expect(result.maxChars).toBeGreaterThan(0) - expect(result.reason).toContain("150000 tokens") - expect(result.reason).toContain("114000 tokens available") - }) - - it("should handle case where no budget is available", async () => { - const filePath = "/test/file.txt" - const contextWindow = 200000 - const currentTokens = 200000 // Context is full - - // Mock file stats - large file - mockStat.mockResolvedValue({ - size: 150000, - } as any) - - const result = await 
validateFileTokenBudget(filePath, contextWindow, currentTokens) - - expect(result.shouldTruncate).toBe(true) - expect(result.maxChars).toBe(0) - expect(result.reason).toContain("No available context budget") - }) - - it("should handle errors gracefully and not truncate", async () => { - const filePath = "/test/error-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - - // Mock file stats to throw an error - mockStat.mockRejectedValue(new Error("File not found")) - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - expect(result.shouldTruncate).toBe(false) - }) - - it("should calculate correct token budget with 60/40 split", async () => { - const filePath = "/test/file.txt" - const contextWindow = 100000 - const currentTokens = 20000 // 80k remaining - const fileContent = "test content" - - mockStat.mockResolvedValue({ size: 150000 } as any) - mockReadFile.mockResolvedValue(fileContent) - - // Available budget should be: (100000 - 20000) * 0.6 = 48000 - // File uses 50k tokens, should be truncated - mockCountTokens.mockResolvedValue(50000) - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - expect(result.shouldTruncate).toBe(true) - // maxChars should be approximately 48000 * 3 = 144000 - expect(result.maxChars).toBe(144000) - }) - - it("should validate files at the FILE_SIZE_THRESHOLD boundary", async () => { - const filePath = "/test/boundary-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - const fileContent = "x".repeat(1000) - - // Mock file stats - exactly at threshold (should trigger validation) - mockStat.mockResolvedValue({ - size: FILE_SIZE_THRESHOLD, - } as any) - - mockReadFile.mockResolvedValue(fileContent) - mockCountTokens.mockResolvedValue(30000) // Within budget - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - // At exactly the threshold, it should validate - 
expect(mockReadFile).toHaveBeenCalled() - expect(mockCountTokens).toHaveBeenCalled() - expect(result.shouldTruncate).toBe(false) - }) - - it("should provide preview for files exceeding MAX_FILE_SIZE_FOR_TOKENIZATION", async () => { - const filePath = "/test/huge-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - const previewContent = "x".repeat(PREVIEW_SIZE_FOR_LARGE_FILES) - - // Mock file stats - file exceeds max tokenization size (e.g., 10MB when max is 5MB) - mockStat.mockResolvedValue({ - size: MAX_FILE_SIZE_FOR_TOKENIZATION + 1000000, // 1MB over the limit - } as any) - - // Mock file.open and read for preview - const mockRead = vi.fn().mockResolvedValue({ - bytesRead: PREVIEW_SIZE_FOR_LARGE_FILES, - }) - const mockClose = vi.fn().mockResolvedValue(undefined) - mockOpen.mockResolvedValue({ - read: mockRead, - close: mockClose, - } as any) - - // Mock token counting for the preview - mockCountTokens.mockResolvedValue(30000) // Preview fits within budget - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - expect(result.shouldTruncate).toBe(true) - expect(result.isPreview).toBe(true) - expect(result.reason).toContain("too large") - expect(result.reason).toContain("preview") - // Should read preview and count tokens - expect(mockOpen).toHaveBeenCalled() - expect(mockCountTokens).toHaveBeenCalled() - }) - - it("should handle files exactly at MAX_FILE_SIZE_FOR_TOKENIZATION boundary", async () => { - const filePath = "/test/boundary-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - const fileContent = "x".repeat(1000) - - // Mock file stats - exactly at max size - mockStat.mockResolvedValue({ - size: MAX_FILE_SIZE_FOR_TOKENIZATION, - } as any) - - mockReadFile.mockResolvedValue(fileContent) - mockCountTokens.mockResolvedValue(30000) // Within budget - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - // At exactly the limit, should still 
attempt to tokenize - expect(mockReadFile).toHaveBeenCalled() - expect(mockCountTokens).toHaveBeenCalled() - }) - - it("should handle tokenizer unreachable errors gracefully", async () => { - const filePath = "/test/problematic-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - const fileContent = "x".repeat(200000) // Content that might cause issues - - // Mock file stats - within size limits but content causes tokenizer crash - mockStat.mockResolvedValue({ - size: 200000, - } as any) - - mockReadFile.mockResolvedValue(fileContent) - // Simulate tokenizer "unreachable" error - mockCountTokens.mockRejectedValue(new Error("unreachable")) - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - // Should fallback with conservative estimation - const remainingTokens = contextWindow - currentTokens - const safeReadBudget = Math.floor(remainingTokens * 0.6) // 114000 - - expect(result.shouldTruncate).toBe(true) - expect(result.isPreview).toBe(true) - expect(result.reason).toContain("tokenizer error") - - // The actual maxChars depends on conservative estimation - // content.length (200000) is used as estimate since tokenizer failed - expect(result.maxChars).toBeDefined() - expect(typeof result.maxChars).toBe("number") - }) - - it("should handle other tokenizer errors conservatively", async () => { - const filePath = "/test/error-file.txt" - const contextWindow = 200000 - const currentTokens = 10000 - const fileContent = "test content" - - mockStat.mockResolvedValue({ size: 150000 } as any) - mockReadFile.mockResolvedValue(fileContent) - // Simulate a different error - mockCountTokens.mockRejectedValue(new Error("Network error")) - - const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens) - - // Should return safe fallback (don't truncate, let normal error handling take over) - expect(result.shouldTruncate).toBe(false) - }) - }) - - describe("truncateFileContent", () => { - it("should 
truncate content to specified character limit", () => { - const content = "a".repeat(1000) - const maxChars = 500 - const totalChars = 1000 - - const result = truncateFileContent(content, maxChars, totalChars, false) - - expect(result.content).toHaveLength(500) - expect(result.content).toBe("a".repeat(500)) - expect(result.notice).toContain("500 of 1000 characters") - expect(result.notice).toContain("context limitations") - }) - - it("should show preview message for large files", () => { - const content = "x".repeat(10000000) // ~10MB (9.54MB in binary) - const maxChars = 100000 // 100KB preview - const totalChars = 10000000 - - const result = truncateFileContent(content, maxChars, totalChars, true) - - expect(result.content).toHaveLength(maxChars) - expect(result.notice).toContain("Preview") - expect(result.notice).toContain("0.1MB") // 100KB = 0.1MB - expect(result.notice).toContain("9.54MB") // Binary MB calculation - expect(result.notice).toContain("line_range") - }) - - it("should include helpful notice about using line_range", () => { - const content = "test content that is very long" - const maxChars = 10 - const totalChars = 31 - - const result = truncateFileContent(content, maxChars, totalChars) - - expect(result.notice).toContain("line_range") - expect(result.notice).toContain("specific sections") - }) - - it("should handle empty content", () => { - const content = "" - const maxChars = 100 - const totalChars = 0 - - const result = truncateFileContent(content, maxChars, totalChars) - - expect(result.content).toBe("") - expect(result.notice).toContain("0 of 0 characters") - }) - - it("should truncate multi-line content correctly", () => { - const content = "line1\nline2\nline3\nline4\nline5" - const maxChars = 15 - const totalChars = content.length - - const result = truncateFileContent(content, maxChars, totalChars) - - expect(result.content).toBe("line1\nline2\nlin") - expect(result.content).toHaveLength(15) - }) - - it("should work with unicode 
characters", () => { - const content = "Hello πŸ˜€ World 🌍 Test πŸŽ‰" - const maxChars = 10 - const totalChars = content.length - - const result = truncateFileContent(content, maxChars, totalChars) - - expect(result.content).toHaveLength(10) - expect(result.notice).toBeDefined() - }) - }) -}) diff --git a/src/core/tools/helpers/fileTokenBudget.ts b/src/core/tools/helpers/fileTokenBudget.ts index ad82f8fb410..4023802680f 100644 --- a/src/core/tools/helpers/fileTokenBudget.ts +++ b/src/core/tools/helpers/fileTokenBudget.ts @@ -1,228 +1,9 @@ -import * as fs from "fs/promises" -import { countTokens } from "../../../utils/countTokens" -import { Anthropic } from "@anthropic-ai/sdk" -import { countFileLinesAndTokens } from "../../../integrations/misc/line-counter" - -/** - * File size threshold (in bytes) above which token validation is triggered. - * Files smaller than this are read without token counting overhead. - */ -export const FILE_SIZE_THRESHOLD = 100_000 // 100KB - -/** - * Absolute maximum file size (in bytes) that will be read for token validation. - * Files larger than this cannot be tokenized due to tokenizer limitations. - * This prevents WASM "unreachable" errors in tiktoken. - */ -export const MAX_FILE_SIZE_FOR_TOKENIZATION = 5_000_000 // 5MB - -/** - * Size of preview to read from files that exceed MAX_FILE_SIZE_FOR_TOKENIZATION. - * This allows the agent to see the beginning of large files without crashing. - */ -export const PREVIEW_SIZE_FOR_LARGE_FILES = 100_000 // 100KB +// Re-export the new incremental token-based file reader +export { readFileWithTokenBudget } from "../../../integrations/misc/read-file-with-budget" +export type { ReadWithBudgetResult, ReadWithBudgetOptions } from "../../../integrations/misc/read-file-with-budget" /** * Percentage of available context to reserve for file reading. * The remaining percentage is reserved for the model's response and overhead. 
*/ export const FILE_READ_BUDGET_PERCENT = 0.6 // 60% for file, 40% for response - -/** - * Result of token budget validation for a file. - */ -export interface TokenBudgetResult { - /** Whether the file content should be truncated */ - shouldTruncate: boolean - /** The maximum number of characters allowed (only relevant if shouldTruncate is true) */ - maxChars?: number - /** Human-readable reason for truncation */ - reason?: string - /** Whether this is a preview of a larger file (only showing beginning) */ - isPreview?: boolean -} - -/** - * Validates whether a file's content fits within the available token budget. - * - * Strategy: - * 1. Files < 100KB: Skip validation (fast path) - * 2. Files >= 100KB: Count tokens and check against budget - * 3. Budget = (contextWindow - currentTokens) * 0.6 - * - * @param filePath - Path to the file to validate - * @param contextWindow - Total context window size in tokens - * @param currentTokens - Current token usage - * @returns TokenBudgetResult indicating whether to truncate and at what character limit - */ -export async function validateFileTokenBudget( - filePath: string, - contextWindow: number, - currentTokens: number, -): Promise { - try { - // Check file size first (fast path) - const stats = await fs.stat(filePath) - const fileSizeBytes = stats.size - - // Fast path: small files always pass - if (fileSizeBytes < FILE_SIZE_THRESHOLD) { - return { shouldTruncate: false } - } - - // Calculate available token budget - const remainingTokens = contextWindow - currentTokens - const safeReadBudget = Math.floor(remainingTokens * FILE_READ_BUDGET_PERCENT) - - // If we don't have enough budget, truncate immediately without reading - if (safeReadBudget <= 0) { - return { - shouldTruncate: true, - maxChars: 0, - reason: "No available context budget for file reading", - } - } - - // For files too large to tokenize entirely, read a preview instead - // The tokenizer (tiktoken WASM) crashes with "unreachable" errors on very large 
files - const isPreviewMode = fileSizeBytes > MAX_FILE_SIZE_FOR_TOKENIZATION - - // Use streaming token counter for normal-sized files to avoid double read - // For previews, still use direct read since we're only reading a portion - let tokenCount = 0 - let streamingSucceeded = false - - if (!isPreviewMode) { - // Try streaming token estimation first (single pass, early exit capability) - try { - const result = await countFileLinesAndTokens(filePath, { - budgetTokens: safeReadBudget, - chunkLines: 256, - }) - tokenCount = result.tokenEstimate - streamingSucceeded = true - - // If streaming indicated we exceeded budget during scan - if (!result.complete) { - // Early exit - we know file exceeds budget without reading it all - const maxChars = Math.floor(safeReadBudget * 3) - return { - shouldTruncate: true, - maxChars, - reason: `File requires ${tokenCount}+ tokens but only ${safeReadBudget} tokens available in context budget`, - } - } - } catch (error) { - // Streaming failed - will fallback to full read below - streamingSucceeded = false - } - } - - // Fallback to full read + token count (for preview mode or if streaming failed) - if (!streamingSucceeded) { - let content: string - - if (isPreviewMode) { - // Read only the preview portion to avoid tokenizer crashes - const fileHandle = await fs.open(filePath, "r") - try { - const buffer = Buffer.alloc(PREVIEW_SIZE_FOR_LARGE_FILES) - const { bytesRead } = await fileHandle.read(buffer, 0, PREVIEW_SIZE_FOR_LARGE_FILES, 0) - content = buffer.slice(0, bytesRead).toString("utf-8") - } finally { - await fileHandle.close() - } - } else { - // Read the entire file for normal-sized files - content = await fs.readFile(filePath, "utf-8") - } - - // Count tokens with error handling - try { - const contentBlocks: Anthropic.Messages.ContentBlockParam[] = [{ type: "text", text: content }] - tokenCount = await countTokens(contentBlocks) - } catch (error) { - // Catch tokenizer "unreachable" errors - const errorMessage = error 
instanceof Error ? error.message : String(error) - if (errorMessage.includes("unreachable")) { - // Use conservative estimation: 2 chars = 1 token - const estimatedTokens = Math.ceil(content.length / 2) - if (estimatedTokens > safeReadBudget) { - return { - shouldTruncate: true, - maxChars: safeReadBudget, - isPreview: true, - reason: `File content caused tokenizer error. Showing truncated preview to fit context budget. Use line_range to read specific sections.`, - } - } - return { - shouldTruncate: true, - maxChars: content.length, - isPreview: true, - reason: `File content caused tokenizer error but fits in context. Use line_range for specific sections.`, - } - } - throw error - } - } - - // Check if content exceeds budget - if (tokenCount > safeReadBudget) { - const maxChars = Math.floor(safeReadBudget * 3) - return { - shouldTruncate: true, - maxChars, - isPreview: isPreviewMode, - reason: isPreviewMode - ? `Preview of large file (${(fileSizeBytes / 1024 / 1024).toFixed(2)}MB) truncated to fit context budget. Use line_range to read specific sections.` - : `File requires ${tokenCount} tokens but only ${safeReadBudget} tokens available in context budget`, - } - } - - // Content fits within budget - if (isPreviewMode) { - return { - shouldTruncate: true, - maxChars: PREVIEW_SIZE_FOR_LARGE_FILES, - isPreview: true, - reason: `File is too large (${(fileSizeBytes / 1024 / 1024).toFixed(2)}MB) to read entirely. Showing preview of first ${(PREVIEW_SIZE_FOR_LARGE_FILES / 1024 / 1024).toFixed(1)}MB. Use line_range to read specific sections.`, - } - } - - // File fits within budget - return { shouldTruncate: false } - } catch (error) { - // On error, be conservative and don't truncate - // This allows the existing error handling to take over - console.warn(`[fileTokenBudget] Error validating file ${filePath}:`, error) - return { shouldTruncate: false } - } -} - -/** - * Truncates file content to fit within the specified character limit. 
- * Adds a notice message at the end to inform the user about truncation. - * - * @param content - The full file content - * @param maxChars - Maximum number of characters to keep - * @param totalChars - Total number of characters in the original file - * @param isPreview - Whether this is a preview of a larger file (not token-budget limited) - * @returns Object containing truncated content and a notice message - */ -export function truncateFileContent( - content: string, - maxChars: number, - totalChars: number, - isPreview: boolean = false, -): { content: string; notice: string } { - const truncatedContent = content.slice(0, maxChars) - - const notice = isPreview - ? `Preview: Showing first ${(maxChars / 1024 / 1024).toFixed(1)}MB of ${(totalChars / 1024 / 1024).toFixed(2)}MB file. Use line_range to read specific sections.` - : `File truncated to ${maxChars} of ${totalChars} characters due to context limitations. Use line_range to read specific sections if needed.` - - return { - content: truncatedContent, - notice, - } -} diff --git a/src/integrations/misc/__tests__/read-file-with-budget.spec.ts b/src/integrations/misc/__tests__/read-file-with-budget.spec.ts new file mode 100644 index 00000000000..7a4e99ce694 --- /dev/null +++ b/src/integrations/misc/__tests__/read-file-with-budget.spec.ts @@ -0,0 +1,321 @@ +import fs from "fs/promises" +import path from "path" +import os from "os" +import { readFileWithTokenBudget } from "../read-file-with-budget" + +describe("readFileWithTokenBudget", () => { + let tempDir: string + + beforeEach(async () => { + // Create a temporary directory for test files + tempDir = path.join(os.tmpdir(), `read-file-budget-test-${Date.now()}`) + await fs.mkdir(tempDir, { recursive: true }) + }) + + afterEach(async () => { + // Clean up temporary directory + await fs.rm(tempDir, { recursive: true, force: true }) + }) + + describe("Basic functionality", () => { + test("reads entire small file when within budget", async () => { + const 
filePath = path.join(tempDir, "small.txt") + const content = "Line 1\nLine 2\nLine 3" + await fs.writeFile(filePath, content) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 1000, // Large budget + }) + + expect(result.content).toBe(content) + expect(result.lineCount).toBe(3) + expect(result.complete).toBe(true) + expect(result.tokenCount).toBeGreaterThan(0) + expect(result.tokenCount).toBeLessThan(1000) + }) + + test("returns correct token count", async () => { + const filePath = path.join(tempDir, "token-test.txt") + const content = "This is a test file with some content." + await fs.writeFile(filePath, content) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 1000, + }) + + // Token count should be reasonable (rough estimate: 1 token per 3-4 chars) + expect(result.tokenCount).toBeGreaterThan(5) + expect(result.tokenCount).toBeLessThan(20) + }) + + test("returns complete: true for files within budget", async () => { + const filePath = path.join(tempDir, "within-budget.txt") + const lines = Array.from({ length: 10 }, (_, i) => `Line ${i + 1}`) + await fs.writeFile(filePath, lines.join("\n")) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 1000, + }) + + expect(result.complete).toBe(true) + expect(result.lineCount).toBe(10) + }) + }) + + describe("Truncation behavior", () => { + test("stops reading when token budget reached", async () => { + const filePath = path.join(tempDir, "large.txt") + // Create a file with many lines + const lines = Array.from({ length: 1000 }, (_, i) => `This is line number ${i + 1} with some content`) + await fs.writeFile(filePath, lines.join("\n")) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 50, // Small budget + }) + + expect(result.complete).toBe(false) + expect(result.lineCount).toBeLessThan(1000) + expect(result.lineCount).toBeGreaterThan(0) + expect(result.tokenCount).toBeLessThanOrEqual(50) + }) + + test("returns 
complete: false when truncated", async () => { + const filePath = path.join(tempDir, "truncated.txt") + const lines = Array.from({ length: 500 }, (_, i) => `Line ${i + 1}`) + await fs.writeFile(filePath, lines.join("\n")) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 20, + }) + + expect(result.complete).toBe(false) + expect(result.tokenCount).toBeLessThanOrEqual(20) + }) + + test("content ends at line boundary (no partial lines)", async () => { + const filePath = path.join(tempDir, "line-boundary.txt") + const lines = Array.from({ length: 100 }, (_, i) => `Line ${i + 1}`) + await fs.writeFile(filePath, lines.join("\n")) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 30, + }) + + // Content should not end mid-line + const contentLines = result.content.split("\n") + expect(contentLines.length).toBe(result.lineCount) + // Last line should be complete (not cut off) + expect(contentLines[contentLines.length - 1]).toMatch(/^Line \d+$/) + }) + + test("works with different chunk sizes", async () => { + const filePath = path.join(tempDir, "chunks.txt") + const lines = Array.from({ length: 1000 }, (_, i) => `Line ${i + 1}`) + await fs.writeFile(filePath, lines.join("\n")) + + // Test with small chunk size + const result1 = await readFileWithTokenBudget(filePath, { + budgetTokens: 50, + chunkLines: 10, + }) + + // Test with large chunk size + const result2 = await readFileWithTokenBudget(filePath, { + budgetTokens: 50, + chunkLines: 500, + }) + + // Both should truncate, but may differ slightly in exact line count + expect(result1.complete).toBe(false) + expect(result2.complete).toBe(false) + expect(result1.tokenCount).toBeLessThanOrEqual(50) + expect(result2.tokenCount).toBeLessThanOrEqual(50) + }) + }) + + describe("Edge cases", () => { + test("handles empty file", async () => { + const filePath = path.join(tempDir, "empty.txt") + await fs.writeFile(filePath, "") + + const result = await 
readFileWithTokenBudget(filePath, { + budgetTokens: 100, + }) + + expect(result.content).toBe("") + expect(result.lineCount).toBe(0) + expect(result.tokenCount).toBe(0) + expect(result.complete).toBe(true) + }) + + test("handles single line file", async () => { + const filePath = path.join(tempDir, "single-line.txt") + await fs.writeFile(filePath, "Single line content") + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 100, + }) + + expect(result.content).toBe("Single line content") + expect(result.lineCount).toBe(1) + expect(result.complete).toBe(true) + }) + + test("handles budget of 0 tokens", async () => { + const filePath = path.join(tempDir, "zero-budget.txt") + await fs.writeFile(filePath, "Line 1\nLine 2\nLine 3") + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 0, + }) + + expect(result.content).toBe("") + expect(result.lineCount).toBe(0) + expect(result.tokenCount).toBe(0) + expect(result.complete).toBe(false) + }) + + test("handles very small budget (fewer tokens than first line)", async () => { + const filePath = path.join(tempDir, "tiny-budget.txt") + const longLine = "This is a very long line with lots of content that will exceed a tiny token budget" + await fs.writeFile(filePath, `${longLine}\nLine 2\nLine 3`) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 2, // Very small budget + }) + + // Should return empty since first line exceeds budget + expect(result.content).toBe("") + expect(result.lineCount).toBe(0) + expect(result.complete).toBe(false) + }) + + test("throws error for non-existent file", async () => { + const filePath = path.join(tempDir, "does-not-exist.txt") + + await expect( + readFileWithTokenBudget(filePath, { + budgetTokens: 100, + }), + ).rejects.toThrow("File not found") + }) + + test("handles file with no trailing newline", async () => { + const filePath = path.join(tempDir, "no-trailing-newline.txt") + await fs.writeFile(filePath, "Line 1\nLine 
2\nLine 3") + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 1000, + }) + + expect(result.content).toBe("Line 1\nLine 2\nLine 3") + expect(result.lineCount).toBe(3) + expect(result.complete).toBe(true) + }) + + test("handles file with trailing newline", async () => { + const filePath = path.join(tempDir, "trailing-newline.txt") + await fs.writeFile(filePath, "Line 1\nLine 2\nLine 3\n") + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 1000, + }) + + expect(result.content).toBe("Line 1\nLine 2\nLine 3") + expect(result.lineCount).toBe(3) + expect(result.complete).toBe(true) + }) + }) + + describe("Token counting accuracy", () => { + test("returned tokenCount matches actual tokens in content", async () => { + const filePath = path.join(tempDir, "accuracy.txt") + const content = "Hello world\nThis is a test\nWith some content" + await fs.writeFile(filePath, content) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 1000, + }) + + // Verify the token count is reasonable + // Rough estimate: 1 token per 3-4 characters + const minExpected = Math.floor(content.length / 5) + const maxExpected = Math.ceil(content.length / 2) + + expect(result.tokenCount).toBeGreaterThanOrEqual(minExpected) + expect(result.tokenCount).toBeLessThanOrEqual(maxExpected) + }) + + test("handles special characters correctly", async () => { + const filePath = path.join(tempDir, "special-chars.txt") + const content = "Special chars: @#$%^&*()\nUnicode: δ½ ε₯½δΈ–η•Œ\nEmoji: πŸ˜€πŸŽ‰" + await fs.writeFile(filePath, content) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 1000, + }) + + expect(result.content).toBe(content) + expect(result.tokenCount).toBeGreaterThan(0) + expect(result.complete).toBe(true) + }) + + test("handles code content", async () => { + const filePath = path.join(tempDir, "code.ts") + const code = `function hello(name: string): string {\n return \`Hello, \${name}!\`\n}` + 
await fs.writeFile(filePath, code) + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 1000, + }) + + expect(result.content).toBe(code) + expect(result.tokenCount).toBeGreaterThan(0) + expect(result.complete).toBe(true) + }) + }) + + describe("Performance", () => { + test("handles large files efficiently", async () => { + const filePath = path.join(tempDir, "large-file.txt") + // Create a 1MB file + const lines = Array.from({ length: 10000 }, (_, i) => `Line ${i + 1} with some additional content`) + await fs.writeFile(filePath, lines.join("\n")) + + const startTime = Date.now() + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 100, + }) + + const endTime = Date.now() + const duration = endTime - startTime + + // Should complete in reasonable time (less than 5 seconds) + expect(duration).toBeLessThan(5000) + expect(result.complete).toBe(false) + expect(result.tokenCount).toBeLessThanOrEqual(100) + }) + + test("early exits when budget is reached", async () => { + const filePath = path.join(tempDir, "early-exit.txt") + // Create a very large file + const lines = Array.from({ length: 50000 }, (_, i) => `Line ${i + 1}`) + await fs.writeFile(filePath, lines.join("\n")) + + const startTime = Date.now() + + const result = await readFileWithTokenBudget(filePath, { + budgetTokens: 50, // Small budget should trigger early exit + }) + + const endTime = Date.now() + const duration = endTime - startTime + + // Should be much faster than reading entire file (less than 2 seconds) + expect(duration).toBeLessThan(2000) + expect(result.complete).toBe(false) + expect(result.lineCount).toBeLessThan(50000) + }) + }) +}) diff --git a/src/integrations/misc/read-file-with-budget.ts b/src/integrations/misc/read-file-with-budget.ts new file mode 100644 index 00000000000..15aa4f1144f --- /dev/null +++ b/src/integrations/misc/read-file-with-budget.ts @@ -0,0 +1,182 @@ +import { createReadStream } from "fs" +import fs from "fs/promises" 
+import { createInterface } from "readline"
+import { countTokens } from "../../utils/countTokens"
+import { Anthropic } from "@anthropic-ai/sdk"
+
+export interface ReadWithBudgetResult {
+	/** The content read up to the token budget */
+	content: string
+	/** Actual token count of returned content */
+	tokenCount: number
+	/** Total lines in the returned content */
+	lineCount: number
+	/** Whether the entire file was read (false if truncated) */
+	complete: boolean
+}
+
+export interface ReadWithBudgetOptions {
+	/** Maximum tokens allowed. Required. */
+	budgetTokens: number
+	/** Number of lines to buffer before token counting (default: 256) */
+	chunkLines?: number
+}
+
+/**
+ * Reads a file while incrementally counting tokens, stopping when budget is reached.
+ *
+ * Unlike validateFileTokenBudget + extractTextFromFile, this is a single-pass
+ * operation that returns the actual content up to the token limit.
+ *
+ * @param filePath - Path to the file to read
+ * @param options - Budget and chunking options
+ * @returns Content read, token count, and completion status
+ */
+export async function readFileWithTokenBudget(
+	filePath: string,
+	options: ReadWithBudgetOptions,
+): Promise<ReadWithBudgetResult> {
+	const { budgetTokens, chunkLines = 256 } = options
+
+	// Verify file exists
+	try {
+		await fs.access(filePath)
+	} catch {
+		throw new Error(`File not found: ${filePath}`)
+	}
+
+	return new Promise((resolve, reject) => {
+		let content = ""
+		let lineCount = 0
+		let tokenCount = 0
+		let lineBuffer: string[] = []
+		let complete = true
+		let isProcessing = false
+		let shouldClose = false
+
+		const readStream = createReadStream(filePath)
+		const rl = createInterface({
+			input: readStream,
+			crlfDelay: Infinity,
+		})
+
+		const processBuffer = async (): Promise<boolean> => {
+			if (lineBuffer.length === 0) return true
+
+			const bufferText = lineBuffer.join("\n")
+			const currentBuffer = [...lineBuffer]
+			lineBuffer = []
+
+			// Count tokens for this chunk
+			let chunkTokens: number
+			try {
+ const contentBlocks: Anthropic.Messages.ContentBlockParam[] = [{ type: "text", text: bufferText }] + chunkTokens = await countTokens(contentBlocks) + } catch { + // Fallback: conservative estimate (2 chars per token) + chunkTokens = Math.ceil(bufferText.length / 2) + } + + // Check if adding this chunk would exceed budget + if (tokenCount + chunkTokens > budgetTokens) { + // Need to find cutoff within this chunk using binary search + let low = 0 + let high = currentBuffer.length + let bestFit = 0 + let bestTokens = 0 + + while (low < high) { + const mid = Math.floor((low + high + 1) / 2) + const testContent = currentBuffer.slice(0, mid).join("\n") + let testTokens: number + try { + const blocks: Anthropic.Messages.ContentBlockParam[] = [{ type: "text", text: testContent }] + testTokens = await countTokens(blocks) + } catch { + testTokens = Math.ceil(testContent.length / 2) + } + + if (tokenCount + testTokens <= budgetTokens) { + bestFit = mid + bestTokens = testTokens + low = mid + } else { + high = mid - 1 + } + } + + // Add best fit lines + if (bestFit > 0) { + const fitContent = currentBuffer.slice(0, bestFit).join("\n") + content += (content.length > 0 ? "\n" : "") + fitContent + tokenCount += bestTokens + lineCount += bestFit + } + complete = false + return false + } + + // Entire chunk fits - add it all + content += (content.length > 0 ? 
"\n" : "") + bufferText + tokenCount += chunkTokens + lineCount += currentBuffer.length + return true + } + + rl.on("line", (line) => { + lineBuffer.push(line) + + if (lineBuffer.length >= chunkLines && !isProcessing) { + isProcessing = true + rl.pause() + + processBuffer() + .then((continueReading) => { + isProcessing = false + if (!continueReading) { + shouldClose = true + rl.close() + readStream.destroy() + } else if (!shouldClose) { + rl.resume() + } + }) + .catch((err) => { + isProcessing = false + shouldClose = true + rl.close() + readStream.destroy() + reject(err) + }) + } + }) + + rl.on("close", async () => { + // Wait for any ongoing processing with timeout + const maxWaitTime = 30000 // 30 seconds + const startWait = Date.now() + while (isProcessing) { + if (Date.now() - startWait > maxWaitTime) { + reject(new Error("Timeout waiting for buffer processing to complete")) + return + } + await new Promise((r) => setTimeout(r, 10)) + } + + // Process remaining buffer + if (!shouldClose) { + try { + await processBuffer() + } catch (err) { + reject(err) + return + } + } + + resolve({ content, tokenCount, lineCount, complete }) + }) + + rl.on("error", reject) + readStream.on("error", reject) + }) +}