diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts
index 7ba822dce0f..e02140b72c4 100644
--- a/src/core/tools/__tests__/readFileTool.spec.ts
+++ b/src/core/tools/__tests__/readFileTool.spec.ts
@@ -201,10 +201,13 @@ function createMockCline(): any {
 		recordToolUsage: vi.fn().mockReturnValue(undefined),
 		recordToolError: vi.fn().mockReturnValue(undefined),
 		didRejectTool: false,
+		getTokenUsage: vi.fn().mockReturnValue({
+			contextTokens: 10000,
+		}),
 		// CRITICAL: Always ensure image support is enabled
 		api: {
 			getModel: vi.fn().mockReturnValue({
-				info: { supportsImages: true },
+				info: { supportsImages: true, contextWindow: 200000 },
 			}),
 		},
 	}
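Note: the two mock additions above mirror exactly the inputs the new budget check reads at runtime. A minimal sketch of how they combine (the 0.6 factor is `FILE_READ_BUDGET_PERCENT` from `fileTokenBudget.ts` below; values are the mocked ones):

```ts
// Derived from the mocked surface above:
const { contextWindow } = cline.api.getModel().info // 200000
const { contextTokens } = cline.getTokenUsage() // 10000
const safeReadBudget = Math.floor((contextWindow - contextTokens) * 0.6) // 114000 tokens for the read
```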
diff --git a/src/core/tools/helpers/__tests__/fileTokenBudget.spec.ts b/src/core/tools/helpers/__tests__/fileTokenBudget.spec.ts
new file mode 100644
index 00000000000..4eea6435a89
--- /dev/null
+++ b/src/core/tools/helpers/__tests__/fileTokenBudget.spec.ts
@@ -0,0 +1,357 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"
+import {
+	validateFileTokenBudget,
+	truncateFileContent,
+	FILE_SIZE_THRESHOLD,
+	MAX_FILE_SIZE_FOR_TOKENIZATION,
+	PREVIEW_SIZE_FOR_LARGE_FILES,
+} from "../fileTokenBudget"
+
+// Mock dependencies
+vi.mock("fs/promises", () => ({
+	stat: vi.fn(),
+	readFile: vi.fn(),
+	open: vi.fn(),
+}))
+
+vi.mock("../../../../utils/countTokens", () => ({
+	countTokens: vi.fn(),
+}))
+
+// Import after mocking
+const fs = await import("fs/promises")
+const { countTokens } = await import("../../../../utils/countTokens")
+
+const mockStat = vi.mocked(fs.stat)
+const mockReadFile = vi.mocked(fs.readFile)
+const mockOpen = vi.mocked(fs.open)
+const mockCountTokens = vi.mocked(countTokens)
+
+describe("fileTokenBudget", () => {
+	beforeEach(() => {
+		vi.clearAllMocks()
+		mockOpen.mockReset()
+	})
+
+	afterEach(() => {
+		vi.restoreAllMocks()
+	})
+
+	describe("validateFileTokenBudget", () => {
+		it("should not truncate files smaller than FILE_SIZE_THRESHOLD", async () => {
+			const filePath = "/test/small-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+
+			// Mock file stats - small file (50KB)
+			mockStat.mockResolvedValue({
+				size: 50000,
+			} as any)
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			expect(result.shouldTruncate).toBe(false)
+			expect(mockReadFile).not.toHaveBeenCalled()
+			expect(mockCountTokens).not.toHaveBeenCalled()
+		})
+
+		it("should validate and not truncate large files that fit within budget", async () => {
+			const filePath = "/test/large-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+			const fileContent = "x".repeat(150000) // 150KB file
+
+			// Mock file stats - large file (150KB)
+			mockStat.mockResolvedValue({
+				size: 150000,
+			} as any)
+
+			// Mock file read
+			mockReadFile.mockResolvedValue(fileContent)
+
+			// Mock token counting - file uses 30k tokens (within 60% of 190k remaining = 114k budget)
+			mockCountTokens.mockResolvedValue(30000)
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			expect(result.shouldTruncate).toBe(false)
+			expect(mockReadFile).toHaveBeenCalledWith(filePath, "utf-8")
+			expect(mockCountTokens).toHaveBeenCalled()
+		})
+
+		it("should truncate large files that exceed token budget", async () => {
+			const filePath = "/test/huge-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+			const fileContent = "x".repeat(500000) // 500KB file
+
+			// Mock file stats - huge file (500KB)
+			mockStat.mockResolvedValue({
+				size: 500000,
+			} as any)
+
+			// Mock file read
+			mockReadFile.mockResolvedValue(fileContent)
+
+			// Mock token counting - file uses 150k tokens (exceeds 60% of 190k remaining = 114k budget)
+			mockCountTokens.mockResolvedValue(150000)
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			expect(result.shouldTruncate).toBe(true)
+			expect(result.maxChars).toBeDefined()
+			expect(result.maxChars).toBeGreaterThan(0)
+			expect(result.reason).toContain("150000 tokens")
+			expect(result.reason).toContain("114000 tokens available")
+		})
+
+		it("should handle case where no budget is available", async () => {
+			const filePath = "/test/file.txt"
+			const contextWindow = 200000
+			const currentTokens = 200000 // Context is full
+
+			// Mock file stats - large file
+			mockStat.mockResolvedValue({
+				size: 150000,
+			} as any)
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			expect(result.shouldTruncate).toBe(true)
+			expect(result.maxChars).toBe(0)
+			expect(result.reason).toContain("No available context budget")
+		})
+
+		it("should handle errors gracefully and not truncate", async () => {
+			const filePath = "/test/error-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+
+			// Mock file stats to throw an error
+			mockStat.mockRejectedValue(new Error("File not found"))
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			expect(result.shouldTruncate).toBe(false)
+		})
+
+		it("should calculate correct token budget with 60/40 split", async () => {
+			const filePath = "/test/file.txt"
+			const contextWindow = 100000
+			const currentTokens = 20000 // 80k remaining
+			const fileContent = "test content"
+
+			mockStat.mockResolvedValue({ size: 150000 } as any)
+			mockReadFile.mockResolvedValue(fileContent)
+
+			// Available budget should be: (100000 - 20000) * 0.6 = 48000
+			// File uses 50k tokens, should be truncated
+			mockCountTokens.mockResolvedValue(50000)
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			expect(result.shouldTruncate).toBe(true)
+			// maxChars should be approximately 48000 * 3 = 144000
+			expect(result.maxChars).toBe(144000)
+		})
+
+		it("should validate files at the FILE_SIZE_THRESHOLD boundary", async () => {
+			const filePath = "/test/boundary-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+			const fileContent = "x".repeat(1000)
+
+			// Mock file stats - exactly at threshold (should trigger validation)
+			mockStat.mockResolvedValue({
+				size: FILE_SIZE_THRESHOLD,
+			} as any)
+
+			mockReadFile.mockResolvedValue(fileContent)
+			mockCountTokens.mockResolvedValue(30000) // Within budget
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			// At exactly the threshold, it should validate
+			expect(mockReadFile).toHaveBeenCalled()
+			expect(mockCountTokens).toHaveBeenCalled()
+			expect(result.shouldTruncate).toBe(false)
+		})
+
+		it("should provide preview for files exceeding MAX_FILE_SIZE_FOR_TOKENIZATION", async () => {
+			const filePath = "/test/huge-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+			const previewContent = "x".repeat(PREVIEW_SIZE_FOR_LARGE_FILES)
+
+			// Mock file stats - file exceeds max tokenization size (e.g., 10MB when max is 5MB)
+			mockStat.mockResolvedValue({
+				size: MAX_FILE_SIZE_FOR_TOKENIZATION + 1000000, // 1MB over the limit
+			} as any)
+
+			// Mock file.open and read for preview
+			const mockRead = vi.fn().mockResolvedValue({
+				bytesRead: PREVIEW_SIZE_FOR_LARGE_FILES,
+			})
+			const mockClose = vi.fn().mockResolvedValue(undefined)
+			mockOpen.mockResolvedValue({
+				read: mockRead,
+				close: mockClose,
+			} as any)
+
+			// Mock token counting for the preview
+			mockCountTokens.mockResolvedValue(30000) // Preview fits within budget
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			expect(result.shouldTruncate).toBe(true)
+			expect(result.isPreview).toBe(true)
+			expect(result.reason).toContain("too large")
+			expect(result.reason).toContain("preview")
+			// Should read preview and count tokens
+			expect(mockOpen).toHaveBeenCalled()
+			expect(mockCountTokens).toHaveBeenCalled()
+		})
+
+		it("should handle files exactly at MAX_FILE_SIZE_FOR_TOKENIZATION boundary", async () => {
+			const filePath = "/test/boundary-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+			const fileContent = "x".repeat(1000)
+
+			// Mock file stats - exactly at max size
+			mockStat.mockResolvedValue({
+				size: MAX_FILE_SIZE_FOR_TOKENIZATION,
+			} as any)
+
+			mockReadFile.mockResolvedValue(fileContent)
+			mockCountTokens.mockResolvedValue(30000) // Within budget
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			// At exactly the limit, should still attempt to tokenize
+			expect(mockReadFile).toHaveBeenCalled()
+			expect(mockCountTokens).toHaveBeenCalled()
+		})
+
+		it("should handle tokenizer unreachable errors gracefully", async () => {
+			const filePath = "/test/problematic-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+			const fileContent = "x".repeat(200000) // Content that might cause issues
+
+			// Mock file stats - within size limits but content causes tokenizer crash
+			mockStat.mockResolvedValue({
+				size: 200000,
+			} as any)
+
+			mockReadFile.mockResolvedValue(fileContent)
+			// Simulate tokenizer "unreachable" error
+			mockCountTokens.mockRejectedValue(new Error("unreachable"))
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			// Should fall back to conservative estimation
+			const remainingTokens = contextWindow - currentTokens
+			const safeReadBudget = Math.floor(remainingTokens * 0.6) // 114000
+
+			expect(result.shouldTruncate).toBe(true)
+			expect(result.isPreview).toBe(true)
+			expect(result.reason).toContain("tokenizer error")
+
+			// The actual maxChars depends on the conservative estimate: with the tokenizer down,
+			// tokens are estimated as content.length / 2 (100000), which fits the 114000 budget,
+			// so maxChars falls back to the full content.length (200000)
+			expect(result.maxChars).toBeDefined()
+			expect(typeof result.maxChars).toBe("number")
+		})
+
+		it("should handle other tokenizer errors conservatively", async () => {
+			const filePath = "/test/error-file.txt"
+			const contextWindow = 200000
+			const currentTokens = 10000
+			const fileContent = "test content"
+
+			mockStat.mockResolvedValue({ size: 150000 } as any)
+			mockReadFile.mockResolvedValue(fileContent)
+			// Simulate a different error
+			mockCountTokens.mockRejectedValue(new Error("Network error"))
+
+			const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
+
+			// Should return safe fallback (don't truncate, let normal error handling take over)
+			expect(result.shouldTruncate).toBe(false)
+		})
+	})
+
+	describe("truncateFileContent", () => {
+		it("should truncate content to specified character limit", () => {
+			const content = "a".repeat(1000)
+			const maxChars = 500
+			const totalChars = 1000
+
+			const result = truncateFileContent(content, maxChars, totalChars, false)
+
+			expect(result.content).toHaveLength(500)
+			expect(result.content).toBe("a".repeat(500))
+			expect(result.notice).toContain("500 of 1000 characters")
+			expect(result.notice).toContain("context limitations")
+		})
+
+		it("should show preview message for large files", () => {
+			const content = "x".repeat(10000000) // ~10MB (9.54MB in binary)
+			const maxChars = 100000 // 100KB preview
+			const totalChars = 10000000
+
+			const result = truncateFileContent(content, maxChars, totalChars, true)
+
+			expect(result.content).toHaveLength(maxChars)
+			expect(result.notice).toContain("Preview")
+			expect(result.notice).toContain("0.1MB") // 100KB = 0.1MB
+			expect(result.notice).toContain("9.54MB") // Binary MB calculation
+			expect(result.notice).toContain("line_range")
+		})
+
+		it("should include helpful notice about using line_range", () => {
+			const content = "test content that is very long"
+			const maxChars = 10
+			const totalChars = 31
+
+			const result = truncateFileContent(content, maxChars, totalChars)
+
+			expect(result.notice).toContain("line_range")
+			expect(result.notice).toContain("specific sections")
+		})
+
+		it("should handle empty content", () => {
+			const content = ""
+			const maxChars = 100
+			const totalChars = 0
+
+			const result = truncateFileContent(content, maxChars, totalChars)
+
+			expect(result.content).toBe("")
+			expect(result.notice).toContain("0 of 0 characters")
+		})
+
+		it("should truncate multi-line content correctly", () => {
+			const content = "line1\nline2\nline3\nline4\nline5"
+			const maxChars = 15
+			const totalChars = content.length
+
+			const result = truncateFileContent(content, maxChars, totalChars)
+
+			expect(result.content).toBe("line1\nline2\nlin")
+			expect(result.content).toHaveLength(15)
+		})
+
+		it("should work with unicode characters", () => {
+			const content = "Hello 😀 World 🌍 Test 🎉"
+			const maxChars = 10
+			const totalChars = content.length
+
+			const result = truncateFileContent(content, maxChars, totalChars)
+
+			expect(result.content).toHaveLength(10)
+			expect(result.notice).toBeDefined()
+		})
+	})
+})
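Before the implementation, a hedged usage sketch of the two exports exercised above; the path and numbers are illustrative, and a real caller would run this inside an async context:

```ts
import * as fs from "fs/promises"
import { validateFileTokenBudget, truncateFileContent } from "../fileTokenBudget"

const budget = await validateFileTokenBudget("/repo/big.log", 200_000, 10_000)
if (budget.shouldTruncate && budget.maxChars !== undefined) {
	const raw = await fs.readFile("/repo/big.log", "utf-8")
	const { content, notice } = truncateFileContent(raw, budget.maxChars, raw.length, budget.isPreview)
	// `content` is capped at maxChars; `notice` points the agent at line_range for the rest.
}
```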
diff --git a/src/core/tools/helpers/fileTokenBudget.ts b/src/core/tools/helpers/fileTokenBudget.ts
new file mode 100644
index 00000000000..ad82f8fb410
--- /dev/null
+++ b/src/core/tools/helpers/fileTokenBudget.ts
@@ -0,0 +1,228 @@
+import * as fs from "fs/promises"
+import { countTokens } from "../../../utils/countTokens"
+import { Anthropic } from "@anthropic-ai/sdk"
+import { countFileLinesAndTokens } from "../../../integrations/misc/line-counter"
+
+/**
+ * File size threshold (in bytes) above which token validation is triggered.
+ * Files smaller than this are read without token counting overhead.
+ */
+export const FILE_SIZE_THRESHOLD = 100_000 // 100KB
+
+/**
+ * Absolute maximum file size (in bytes) that will be read for token validation.
+ * Files larger than this cannot be tokenized due to tokenizer limitations.
+ * This prevents WASM "unreachable" errors in tiktoken.
+ */
+export const MAX_FILE_SIZE_FOR_TOKENIZATION = 5_000_000 // 5MB
+
+/**
+ * Size of preview to read from files that exceed MAX_FILE_SIZE_FOR_TOKENIZATION.
+ * This allows the agent to see the beginning of large files without crashing.
+ */
+export const PREVIEW_SIZE_FOR_LARGE_FILES = 100_000 // 100KB
+
+/**
+ * Percentage of available context to reserve for file reading.
+ * The remaining percentage is reserved for the model's response and overhead.
+ */
+export const FILE_READ_BUDGET_PERCENT = 0.6 // 60% for file, 40% for response
+
+/**
+ * Result of token budget validation for a file.
+ */
+export interface TokenBudgetResult {
+	/** Whether the file content should be truncated */
+	shouldTruncate: boolean
+	/** The maximum number of characters allowed (only relevant if shouldTruncate is true) */
+	maxChars?: number
+	/** Human-readable reason for truncation */
+	reason?: string
+	/** Whether this is a preview of a larger file (only showing the beginning) */
+	isPreview?: boolean
+}
+
+/**
+ * Validates whether a file's content fits within the available token budget.
+ *
+ * Strategy:
+ * 1. Files < 100KB: Skip validation (fast path)
+ * 2. Files >= 100KB: Count tokens and check against budget
+ * 3. Budget = (contextWindow - currentTokens) * 0.6
+ *
+ * @param filePath - Path to the file to validate
+ * @param contextWindow - Total context window size in tokens
+ * @param currentTokens - Current token usage
+ * @returns TokenBudgetResult indicating whether to truncate and at what character limit
+ */
+export async function validateFileTokenBudget(
+	filePath: string,
+	contextWindow: number,
+	currentTokens: number,
+): Promise<TokenBudgetResult> {
+	try {
+		// Check file size first (fast path)
+		const stats = await fs.stat(filePath)
+		const fileSizeBytes = stats.size
+
+		// Fast path: small files always pass
+		if (fileSizeBytes < FILE_SIZE_THRESHOLD) {
+			return { shouldTruncate: false }
+		}
+
+		// Calculate available token budget
+		const remainingTokens = contextWindow - currentTokens
+		const safeReadBudget = Math.floor(remainingTokens * FILE_READ_BUDGET_PERCENT)
+
+		// If we don't have enough budget, truncate immediately without reading
+		if (safeReadBudget <= 0) {
+			return {
+				shouldTruncate: true,
+				maxChars: 0,
+				reason: "No available context budget for file reading",
+			}
+		}
+
+		// For files too large to tokenize entirely, read a preview instead
+		// The tokenizer (tiktoken WASM) crashes with "unreachable" errors on very large files
+		const isPreviewMode = fileSizeBytes > MAX_FILE_SIZE_FOR_TOKENIZATION
+
+		// Use the streaming token counter for normal-sized files to avoid a double read
+		// For previews, still use a direct read since we're only reading a portion
+		let tokenCount = 0
+		let streamingSucceeded = false
+
+		if (!isPreviewMode) {
+			// Try streaming token estimation first (single pass, early-exit capability)
+			try {
+				const result = await countFileLinesAndTokens(filePath, {
+					budgetTokens: safeReadBudget,
+					chunkLines: 256,
+				})
+				tokenCount = result.tokenEstimate
+				streamingSucceeded = true
+
+				// If streaming indicated we exceeded budget during the scan
+				if (!result.complete) {
+					// Early exit - we know the file exceeds the budget without reading it all
+					const maxChars = Math.floor(safeReadBudget * 3)
+					return {
+						shouldTruncate: true,
+						maxChars,
+						reason: `File requires ${tokenCount}+ tokens but only ${safeReadBudget} tokens available in context budget`,
+					}
+				}
+			} catch (error) {
+				// Streaming failed - will fall back to a full read below
+				streamingSucceeded = false
+			}
+		}
+
+		// Fall back to full read + token count (for preview mode or if streaming failed)
+		if (!streamingSucceeded) {
+			let content: string
+
+			if (isPreviewMode) {
+				// Read only the preview portion to avoid tokenizer crashes
+				const fileHandle = await fs.open(filePath, "r")
+				try {
+					const buffer = Buffer.alloc(PREVIEW_SIZE_FOR_LARGE_FILES)
+					const { bytesRead } = await fileHandle.read(buffer, 0, PREVIEW_SIZE_FOR_LARGE_FILES, 0)
+					content = buffer.slice(0, bytesRead).toString("utf-8")
+				} finally {
+					await fileHandle.close()
+				}
+			} else {
+				// Read the entire file for normal-sized files
+				content = await fs.readFile(filePath, "utf-8")
+			}
+
+			// Count tokens with error handling
+			try {
+				const contentBlocks: Anthropic.Messages.ContentBlockParam[] = [{ type: "text", text: content }]
+				tokenCount = await countTokens(contentBlocks)
+			} catch (error) {
+				// Catch tokenizer "unreachable" errors
+				const errorMessage = error instanceof Error ? error.message : String(error)
+				if (errorMessage.includes("unreachable")) {
+					// Use conservative estimation: 2 chars = 1 token
+					const estimatedTokens = Math.ceil(content.length / 2)
+					if (estimatedTokens > safeReadBudget) {
+						return {
+							shouldTruncate: true,
+							maxChars: safeReadBudget,
+							isPreview: true,
+							reason: `File content caused tokenizer error. Showing truncated preview to fit context budget. Use line_range to read specific sections.`,
+						}
+					}
+					return {
+						shouldTruncate: true,
+						maxChars: content.length,
+						isPreview: true,
+						reason: `File content caused tokenizer error but fits in context. Use line_range for specific sections.`,
+					}
+				}
+				throw error
+			}
+		}
+
+		// Check if content exceeds budget
+		if (tokenCount > safeReadBudget) {
+			const maxChars = Math.floor(safeReadBudget * 3)
+			return {
+				shouldTruncate: true,
+				maxChars,
+				isPreview: isPreviewMode,
+				reason: isPreviewMode
+					? `Preview of large file (${(fileSizeBytes / 1024 / 1024).toFixed(2)}MB) truncated to fit context budget. Use line_range to read specific sections.`
+					: `File requires ${tokenCount} tokens but only ${safeReadBudget} tokens available in context budget`,
+			}
+		}
+
+		// Content fits within budget
+		if (isPreviewMode) {
+			return {
+				shouldTruncate: true,
+				maxChars: PREVIEW_SIZE_FOR_LARGE_FILES,
+				isPreview: true,
+				reason: `File is too large (${(fileSizeBytes / 1024 / 1024).toFixed(2)}MB) to read entirely. Showing preview of first ${(PREVIEW_SIZE_FOR_LARGE_FILES / 1024 / 1024).toFixed(1)}MB. Use line_range to read specific sections.`,
+			}
+		}
+
+		// File fits within budget
+		return { shouldTruncate: false }
+	} catch (error) {
+		// On error, be conservative and don't truncate
+		// This allows the existing error handling to take over
+		console.warn(`[fileTokenBudget] Error validating file ${filePath}:`, error)
+		return { shouldTruncate: false }
+	}
+}
+
+/**
+ * Truncates file content to fit within the specified character limit.
+ * Returns a notice message alongside the content to inform the user about the truncation.
+ *
+ * @param content - The full file content
+ * @param maxChars - Maximum number of characters to keep
+ * @param totalChars - Total number of characters in the original file
+ * @param isPreview - Whether this is a preview of a larger file (not token-budget limited)
+ * @returns Object containing the truncated content and a notice message
+ */
+export function truncateFileContent(
+	content: string,
+	maxChars: number,
+	totalChars: number,
+	isPreview: boolean = false,
+): { content: string; notice: string } {
+	const truncatedContent = content.slice(0, maxChars)
+
+	const notice = isPreview
+		? `Preview: Showing first ${(maxChars / 1024 / 1024).toFixed(1)}MB of ${(totalChars / 1024 / 1024).toFixed(2)}MB file. Use line_range to read specific sections.`
+		: `File truncated to ${maxChars} of ${totalChars} characters due to context limitations. Use line_range to read specific sections if needed.`
+
+	return {
+		content: truncatedContent,
+		notice,
+	}
+}
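A worked example of the 60/40 budget math implemented above (the numbers match the spec):

```ts
// 200k-token window with 10k tokens already used:
const remainingTokens = 200_000 - 10_000 // 190000
const safeReadBudget = Math.floor(remainingTokens * FILE_READ_BUDGET_PERCENT) // 114000
// A file measured at 150000 tokens exceeds that budget, so the result is
// shouldTruncate: true with maxChars = 114000 * 3 = 342000 (~3 chars per token).
```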
diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts
index 01427f4d9dc..6223d61f87c 100644
--- a/src/core/tools/readFileTool.ts
+++ b/src/core/tools/readFileTool.ts
@@ -22,6 +22,7 @@ import {
 	processImageFile,
 	ImageMemoryTracker,
 } from "./helpers/imageHelpers"
+import { validateFileTokenBudget, truncateFileContent } from "./helpers/fileTokenBudget"
 
 export function getReadFileToolDescription(blockName: string, blockParams: any): string {
 	// Handle both single path and multiple files via args
@@ -594,13 +595,43 @@ export async function readFileTool(
 				continue
 			}
 
-			// Handle normal file read
-			const content = await extractTextFromFile(fullPath)
-			const lineRangeAttr = ` lines="1-${totalLines}"`
-			let xmlInfo = totalLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : ``
+			// Handle normal file read with token budget validation
+			const modelInfo = cline.api.getModel().info
+			const { contextTokens } = cline.getTokenUsage()
+			const contextWindow = modelInfo.contextWindow
 
-			if (totalLines === 0) {
-				xmlInfo += `<notice>File is empty</notice>\n`
+			// Validate whether the file fits within the token budget
+			const budgetResult = await validateFileTokenBudget(fullPath, contextWindow, contextTokens || 0)
+
+			let content = await extractTextFromFile(fullPath)
+			let xmlInfo = ""
+
+			if (budgetResult.shouldTruncate && budgetResult.maxChars !== undefined) {
+				// Truncate the content to fit budget or show preview for large files
+				const truncateResult = truncateFileContent(
+					content,
+					budgetResult.maxChars,
+					content.length,
+					budgetResult.isPreview,
+				)
+				content = truncateResult.content
+
+				// Reflect actual displayed line count after truncation (count ALL lines, including empty)
+				// Handle trailing newline: "line1\nline2\n" should be 2 lines, not 3
+				let displayedLines = content.length === 0 ? 0 : content.split(/\r?\n/).length
+				if (displayedLines > 0 && content.endsWith("\n")) {
+					displayedLines--
+				}
+				const lineRangeAttr = displayedLines > 0 ? ` lines="1-${displayedLines}"` : ""
+				xmlInfo = content.length > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : ``
+				xmlInfo += `<notice>${truncateResult.notice}</notice>\n`
+			} else {
+				const lineRangeAttr = ` lines="1-${totalLines}"`
+				xmlInfo = totalLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : ``
+
+				if (totalLines === 0) {
+					xmlInfo += `<notice>File is empty</notice>\n`
+				}
 			}
 
 			// Track file read
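Assuming the tool's existing read_file XML conventions (the wrapper tags here are the tool's established format, not introduced by this diff), the truncation branch above yields a payload of roughly this shape with illustrative values:

```
<content lines="1-1200">
...first maxChars characters of the file...
</content>
<notice>File truncated to 342000 of 1500000 characters due to context limitations. Use line_range to read specific sections if needed.</notice>
```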
diff --git a/src/integrations/misc/__tests__/line-counter.spec.ts b/src/integrations/misc/__tests__/line-counter.spec.ts
index e7d0f85c8c5..68011cdc2ce 100644
--- a/src/integrations/misc/__tests__/line-counter.spec.ts
+++ b/src/integrations/misc/__tests__/line-counter.spec.ts
@@ -1,146 +1,98 @@
-import type { Mock } from "vitest"
-import fs from "fs"
-import { countFileLines } from "../line-counter"
+import { describe, it, expect, vi, beforeEach } from "vitest"
+import { countFileLines, countFileLinesAndTokens } from "../line-counter"
+import { countTokens } from "../../../utils/countTokens"
+import { Readable } from "stream"
 
-// Mock the fs module
-vitest.mock("fs", () => ({
+// Mock dependencies
+vi.mock("fs", () => ({
 	default: {
 		promises: {
-			access: vitest.fn(),
+			access: vi.fn(),
 		},
 		constants: {
 			F_OK: 0,
 		},
+		createReadStream: vi.fn(),
 	},
-	createReadStream: vitest.fn(),
+	createReadStream: vi.fn(),
 }))
 
-// Mock readline
-vitest.mock("readline", () => ({
-	createInterface: vitest.fn().mockReturnValue({
-		on: vitest.fn().mockImplementation(function (this: any, event, callback) {
-			if (event === "line" && this.mockLines) {
-				for (let i = 0; i < this.mockLines; i++) {
-					callback()
-				}
-			}
-			if (event === "close") {
-				callback()
-			}
-			return this
-		}),
-		mockLines: 0,
-	}),
+vi.mock("../../../utils/countTokens", () => ({
+	countTokens: vi.fn(),
 }))
 
-describe("countFileLines", () => {
-	beforeEach(() => {
-		vitest.clearAllMocks()
-	})
+const mockCountTokens = vi.mocked(countTokens)
 
-	it("should throw error if file does not exist", async () => {
-		// Setup
-		;(fs.promises.access as Mock).mockRejectedValueOnce(new Error("File not found"))
+// Get the mocked fs module
+const fs = await import("fs")
+const mockCreateReadStream = vi.mocked(fs.createReadStream)
+const mockFsAccess = vi.mocked(fs.default.promises.access)
 
-		// Test & Assert
-		await expect(countFileLines("non-existent-file.txt")).rejects.toThrow("File not found")
+describe("line-counter", () => {
+	beforeEach(() => {
+		vi.clearAllMocks()
 	})
 
-	it("should return the correct line count for a file", async () => {
-		// Setup
-		;(fs.promises.access as Mock).mockResolvedValueOnce(undefined)
-
-		const mockEventEmitter = {
-			on: vitest.fn().mockImplementation(function (this: any, event, callback) {
-				if (event === "line") {
-					// Simulate 10 lines
-					for (let i = 0; i < 10; i++) {
-						callback()
-					}
-				}
-				if (event === "close") {
-					callback()
-				}
-				return this
-			}),
-		}
-
-		const mockReadStream = {
-			on: vitest.fn().mockImplementation(function (this: any, _event, _callback) {
-				return this
-			}),
-		}
-
-		const { createReadStream } = await import("fs")
-		vitest.mocked(createReadStream).mockReturnValueOnce(mockReadStream as any)
-		const readline = await import("readline")
-		vitest.mocked(readline.createInterface).mockReturnValueOnce(mockEventEmitter as any)
-
-		// Test
-		const result = await countFileLines("test-file.txt")
-
-		// Assert
-		expect(result).toBe(10)
-		expect(fs.promises.access).toHaveBeenCalledWith("test-file.txt", fs.constants.F_OK)
-		expect(createReadStream).toHaveBeenCalledWith("test-file.txt")
+	describe("countFileLinesAndTokens", () => {
+		it("should count lines and tokens without budget limit", async () => {
+			// Create a proper readable stream
+			const mockStream = new Readable({
+				read() {
+					this.push("line1\n")
+					this.push("line2\n")
+					this.push("line3\n")
+					this.push(null) // End of stream
+				},
+			})
+
+			mockCreateReadStream.mockReturnValue(mockStream as any)
+			mockFsAccess.mockResolvedValue(undefined)
+
+			// Mock token counting - simulate ~10 tokens per chunk
+			mockCountTokens.mockResolvedValue(30)
+
+			const result = await countFileLinesAndTokens("/test/file.txt")
+
+			expect(result.lineCount).toBe(3)
+			expect(result.tokenEstimate).toBe(30)
+			expect(result.complete).toBe(true)
+		})
+
+		it("should handle tokenizer errors with conservative estimate", async () => {
+			// Create a proper readable stream
+			const mockStream = new Readable({
+				read() {
+					this.push("line1\n")
+					this.push(null)
+				},
+			})
+
+			mockCreateReadStream.mockReturnValue(mockStream as any)
+			mockFsAccess.mockResolvedValue(undefined)
+
+			// Simulate tokenizer error
+			mockCountTokens.mockRejectedValue(new Error("unreachable"))
+
+			const result = await countFileLinesAndTokens("/test/file.txt")
+
+			// Should still complete with a conservative token estimate (~content.length / 2)
+			expect(result.lineCount).toBe(1)
+			expect(result.tokenEstimate).toBeGreaterThan(0)
+			expect(result.complete).toBe(true)
+		})
+
+		it("should throw error for non-existent files", async () => {
+			mockFsAccess.mockRejectedValue(new Error("ENOENT"))
+
+			await expect(countFileLinesAndTokens("/nonexistent/file.txt")).rejects.toThrow("File not found")
+		})
 	})
 
-	it("should handle files with no lines", async () => {
-		// Setup
-		;(fs.promises.access as Mock).mockResolvedValueOnce(undefined)
-
-		const mockEventEmitter = {
-			on: vitest.fn().mockImplementation(function (this: any, event, callback) {
-				if (event === "close") {
-					callback()
-				}
-				return this
-			}),
-		}
-
-		const mockReadStream = {
-			on: vitest.fn().mockImplementation(function (this: any, _event, _callback) {
-				return this
-			}),
-		}
-
-		const { createReadStream } = await import("fs")
-		vitest.mocked(createReadStream).mockReturnValueOnce(mockReadStream as any)
-		const readline = await import("readline")
-		vitest.mocked(readline.createInterface).mockReturnValueOnce(mockEventEmitter as any)
-
-		// Test
-		const result = await countFileLines("empty-file.txt")
-
-		// Assert
-		expect(result).toBe(0)
-	})
+	describe("countFileLines", () => {
+		it("should throw error for non-existent files", async () => {
+			mockFsAccess.mockRejectedValue(new Error("ENOENT"))
 
-	it("should handle errors during reading", async () => {
-		// Setup
-		;(fs.promises.access as Mock).mockResolvedValueOnce(undefined)
-
-		const mockEventEmitter = {
-			on: vitest.fn().mockImplementation(function (this: any, event, callback) {
-				if (event === "error" && callback) {
-					callback(new Error("Read error"))
-				}
-				return this
-			}),
-		}
-
-		const mockReadStream = {
-			on: vitest.fn().mockImplementation(function (this: any, _event, _callback) {
-				return this
-			}),
-		}
-
-		const { createReadStream } = await import("fs")
-		vitest.mocked(createReadStream).mockReturnValueOnce(mockReadStream as any)
-		const readline = await import("readline")
-		vitest.mocked(readline.createInterface).mockReturnValueOnce(mockEventEmitter as any)
-
-		// Test & Assert
-		await expect(countFileLines("error-file.txt")).rejects.toThrow("Read error")
+			await expect(countFileLines("/nonexistent/file.txt")).rejects.toThrow("File not found")
+		})
 	})
 })
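The tokenizer-error test above passes because the scanner falls back to a conservative estimate instead of failing the read; the rule it exercises is simply:

```ts
// Fallback used when countTokens throws (see processBuffer below): ~1 token per 2 chars.
const conservativeEstimate = (text: string) => Math.ceil(text.length / 2)
conservativeEstimate("line1") // 3 — which is why the test only asserts tokenEstimate > 0
```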
"../../utils/countTokens" +import { Anthropic } from "@anthropic-ai/sdk" /** * Efficiently counts lines in a file using streams without loading the entire file into memory @@ -41,3 +43,125 @@ export async function countFileLines(filePath: string): Promise { }) }) } + +export interface LineAndTokenCountResult { + /** Total number of lines counted */ + lineCount: number + /** Estimated token count */ + tokenEstimate: number + /** Whether the full file was scanned (false if early exit occurred) */ + complete: boolean +} + +export interface LineAndTokenCountOptions { + /** Maximum tokens allowed before early exit. If undefined, scans entire file */ + budgetTokens?: number + /** Number of lines to buffer before running token estimation (default: 256) */ + chunkLines?: number +} + +/** + * Efficiently counts lines and estimates tokens in a file using streams with incremental token estimation. + * Processes file in chunks to avoid memory issues and can early-exit when budget is exceeded. + * + * @param filePath - Path to the file to analyze + * @param options - Configuration options for counting + * @returns A promise that resolves to line count, token estimate, and completion status + */ +export async function countFileLinesAndTokens( + filePath: string, + options: LineAndTokenCountOptions = {}, +): Promise { + const { budgetTokens, chunkLines = 256 } = options + + // Check if file exists + try { + await fs.promises.access(filePath, fs.constants.F_OK) + } catch (error) { + throw new Error(`File not found: ${filePath}`) + } + + return new Promise((resolve, reject) => { + let lineCount = 0 + let tokenEstimate = 0 + let lineBuffer: string[] = [] + let complete = true + let isProcessing = false + let shouldClose = false + + const readStream = createReadStream(filePath) + const rl = createInterface({ + input: readStream, + crlfDelay: Infinity, + }) + + const processBuffer = async () => { + if (lineBuffer.length === 0) return + + const bufferText = lineBuffer.join("\n") + lineBuffer = [] // Clear buffer before processing + + try { + const contentBlocks: Anthropic.Messages.ContentBlockParam[] = [{ type: "text", text: bufferText }] + const chunkTokens = await countTokens(contentBlocks) + tokenEstimate += chunkTokens + } catch (error) { + // On tokenizer error, use conservative estimate: 2 char ≈ 1 token + tokenEstimate += Math.ceil(bufferText.length / 2) + } + + // Check if we've exceeded budget + if (budgetTokens !== undefined && tokenEstimate > budgetTokens) { + complete = false + shouldClose = true + rl.close() + readStream.destroy() + } + } + + rl.on("line", (line) => { + lineCount++ + lineBuffer.push(line) + + // Process buffer when it reaches chunk size + if (lineBuffer.length >= chunkLines && !isProcessing) { + isProcessing = true + rl.pause() + processBuffer() + .then(() => { + isProcessing = false + if (!shouldClose) { + rl.resume() + } + }) + .catch((err) => { + isProcessing = false + reject(err) + }) + } + }) + + rl.on("close", async () => { + // Wait for any ongoing processing to complete + while (isProcessing) { + await new Promise((r) => setTimeout(r, 10)) + } + + // Process any remaining lines in buffer + try { + await processBuffer() + resolve({ lineCount, tokenEstimate, complete }) + } catch (err) { + reject(err) + } + }) + + rl.on("error", (err) => { + reject(err) + }) + + readStream.on("error", (err) => { + reject(err) + }) + }) +}