From cb1da297473f05064d81d2b74d164d96a0a2bde6 Mon Sep 17 00:00:00 2001 From: gsxdsm Date: Tue, 24 Feb 2026 12:57:00 -0800 Subject: [PATCH 1/5] Changes from fix/cursor-fix --- apps/server/src/services/execution-service.ts | 49 +- .../lib/thinking-level-normalization.test.ts | 20 + .../services/agent-output-validation.test.ts | 192 ++++++++ .../unit/services/execution-service.test.ts | 444 +++++++++++++++++- .../dialogs/pr-comment-resolution-dialog.tsx | 20 +- .../board-view/dialogs/add-feature-dialog.tsx | 24 +- apps/ui/src/styles/themes/paper.css | 10 +- .../features/opus-thinking-level-none.spec.ts | 95 ++++ libs/types/src/index.ts | 1 + libs/types/src/settings.ts | 22 + 10 files changed, 829 insertions(+), 48 deletions(-) create mode 100644 apps/server/tests/unit/lib/thinking-level-normalization.test.ts create mode 100644 apps/server/tests/unit/services/agent-output-validation.test.ts create mode 100644 apps/ui/tests/features/opus-thinking-level-none.spec.ts diff --git a/apps/server/src/services/execution-service.ts b/apps/server/src/services/execution-service.ts index 6e23405a7..633ac8093 100644 --- a/apps/server/src/services/execution-service.ts +++ b/apps/server/src/services/execution-service.ts @@ -60,6 +60,12 @@ import type { const logger = createLogger('ExecutionService'); +/** Marker written by agent-executor for each tool invocation. */ +const TOOL_USE_MARKER = '🔧 Tool:'; + +/** Minimum trimmed output length to consider agent work meaningful. */ +const MIN_MEANINGFUL_OUTPUT_LENGTH = 200; + export class ExecutionService { constructor( private eventBus: TypedEventBus, @@ -409,7 +415,41 @@ Please continue from where you left off and complete all remaining tasks. Use th } } - const finalStatus = feature.skipTests ? 'waiting_approval' : 'verified'; + // Read agent output before determining final status. + // CLI-based providers (Cursor, Codex, etc.) may exit quickly without doing + // meaningful work. Check output to avoid prematurely marking as 'verified'. 
+ const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md'); + let agentOutput = ''; + try { + agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string; + } catch { + /* */ + } + + // Determine if the agent did meaningful work by checking for tool usage + // indicators in the output. The agent executor writes "🔧 Tool:" markers + // each time a tool is invoked. No tool usage suggests the CLI exited + // without performing implementation work. + const hasToolUsage = agentOutput.includes(TOOL_USE_MARKER); + const isOutputTooShort = agentOutput.trim().length < MIN_MEANINGFUL_OUTPUT_LENGTH; + const agentDidWork = hasToolUsage && !isOutputTooShort; + + let finalStatus: 'verified' | 'waiting_approval'; + if (feature.skipTests) { + finalStatus = 'waiting_approval'; + } else if (!agentDidWork) { + // Agent didn't produce meaningful output (e.g., CLI exited quickly). + // Route to waiting_approval so the user can review and re-run. + finalStatus = 'waiting_approval'; + logger.warn( + `[executeFeature] Feature ${featureId}: agent produced insufficient output ` + + `(${agentOutput.trim().length}/${MIN_MEANINGFUL_OUTPUT_LENGTH} chars, toolUsage=${hasToolUsage}). ` + + `Setting status to waiting_approval instead of verified.` + ); + } else { + finalStatus = 'verified'; + } + await this.updateFeatureStatusFn(projectPath, featureId, finalStatus); this.recordSuccessFn(); @@ -421,13 +461,6 @@ Please continue from where you left off and complete all remaining tasks. 
Use th const hasIncompleteTasks = totalTasks > 0 && completedTasks < totalTasks; try { - const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md'); - let agentOutput = ''; - try { - agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string; - } catch { - /* */ - } if (agentOutput) { const summary = extractSummary(agentOutput); if (summary) await this.saveFeatureSummaryFn(projectPath, featureId, summary); diff --git a/apps/server/tests/unit/lib/thinking-level-normalization.test.ts b/apps/server/tests/unit/lib/thinking-level-normalization.test.ts new file mode 100644 index 000000000..35f1b6e04 --- /dev/null +++ b/apps/server/tests/unit/lib/thinking-level-normalization.test.ts @@ -0,0 +1,20 @@ +import { describe, it, expect } from 'vitest'; +import { normalizeThinkingLevelForModel } from '@automaker/types'; + +describe('normalizeThinkingLevelForModel', () => { + it('preserves explicitly selected none for Opus models', () => { + expect(normalizeThinkingLevelForModel('claude-opus', 'none')).toBe('none'); + }); + + it('falls back to none when Opus receives an unsupported manual thinking level', () => { + expect(normalizeThinkingLevelForModel('claude-opus', 'medium')).toBe('none'); + }); + + it('keeps adaptive for Opus when adaptive is selected', () => { + expect(normalizeThinkingLevelForModel('claude-opus', 'adaptive')).toBe('adaptive'); + }); + + it('preserves supported manual levels for non-Opus models', () => { + expect(normalizeThinkingLevelForModel('claude-sonnet', 'high')).toBe('high'); + }); +}); diff --git a/apps/server/tests/unit/services/agent-output-validation.test.ts b/apps/server/tests/unit/services/agent-output-validation.test.ts new file mode 100644 index 000000000..693924694 --- /dev/null +++ b/apps/server/tests/unit/services/agent-output-validation.test.ts @@ -0,0 +1,192 @@ +import { describe, it, expect } from 'vitest'; + +/** + * Contract tests verifying the tool marker format used by agent-executor + * (which 
writes agent output) and execution-service (which reads it to
+ * determine if the agent did meaningful work).
+ *
+ * The agent-executor writes: `\n🔧 Tool: ${block.name}\n`
+ * The execution-service checks: `agentOutput.includes('🔧 Tool:')`
+ *
+ * These tests ensure the marker format contract stays consistent and
+ * document the exact detection logic used for status determination.
+ */
+
+// The exact marker prefix that execution-service searches for
+const TOOL_MARKER = '🔧 Tool:';
+
+// Minimum output length threshold for "meaningful work"
+const MIN_OUTPUT_LENGTH = 200;
+
+/**
+ * Simulates the agent-executor's tool_use output format.
+ * See: agent-executor.ts line ~293
+ */
+function formatToolUseBlock(toolName: string, input?: Record<string, unknown>): string {
+  let output = `\n${TOOL_MARKER} ${toolName}\n`;
+  if (input) output += `Input: ${JSON.stringify(input, null, 2)}\n`;
+  return output;
+}
+
+/**
+ * Simulates the execution-service's output validation logic.
+ * See: execution-service.ts lines ~427-429
+ */
+function validateAgentOutput(
+  agentOutput: string,
+  skipTests: boolean
+): 'verified' | 'waiting_approval' {
+  const hasToolUsage = agentOutput.includes(TOOL_MARKER);
+  const isOutputTooShort = agentOutput.trim().length < MIN_OUTPUT_LENGTH;
+  const agentDidWork = hasToolUsage && !isOutputTooShort;
+
+  if (skipTests) return 'waiting_approval';
+  if (!agentDidWork) return 'waiting_approval';
+  return 'verified';
+}
+
+describe('Agent Output Validation - Contract Tests', () => {
+  describe('tool marker format contract', () => {
+    it('agent-executor tool format contains the expected marker', () => {
+      const toolOutput = formatToolUseBlock('Read', { file_path: '/src/index.ts' });
+      expect(toolOutput).toContain(TOOL_MARKER);
+    });
+
+    it('agent-executor tool format includes tool name after marker', () => {
+      const toolOutput = formatToolUseBlock('Edit', {
+        file_path: '/src/app.ts',
+        old_string: 'foo',
+        new_string: 'bar',
+      });
+      expect(toolOutput).toContain('🔧 
Tool: Edit'); + }); + + it('agent-executor tool format includes JSON input', () => { + const input = { file_path: '/src/index.ts' }; + const toolOutput = formatToolUseBlock('Read', input); + expect(toolOutput).toContain('Input: '); + expect(toolOutput).toContain('"file_path": "/src/index.ts"'); + }); + + it('agent-executor tool format works without input', () => { + const toolOutput = formatToolUseBlock('Bash'); + expect(toolOutput).toContain('🔧 Tool: Bash'); + expect(toolOutput).not.toContain('Input:'); + }); + + it('marker includes colon and space to avoid false positives', () => { + // Ensure the marker is specific enough to avoid matching other emoji patterns + expect(TOOL_MARKER).toBe('🔧 Tool:'); + expect(TOOL_MARKER).toContain(':'); + }); + }); + + describe('output validation logic', () => { + it('verified: tool usage + sufficient output', () => { + const output = + 'Starting implementation of the new feature...\n' + + formatToolUseBlock('Read', { file_path: '/src/index.ts' }) + + 'I can see the existing code. Let me make the needed changes.\n' + + formatToolUseBlock('Edit', { file_path: '/src/index.ts' }) + + 'Changes complete. The implementation adds new validation logic and tests.'; + expect(output.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH); + + expect(validateAgentOutput(output, false)).toBe('verified'); + }); + + it('waiting_approval: no tool markers regardless of length', () => { + const longOutput = 'I analyzed the codebase. 
'.repeat(50); + expect(longOutput.trim().length).toBeGreaterThan(MIN_OUTPUT_LENGTH); + + expect(validateAgentOutput(longOutput, false)).toBe('waiting_approval'); + }); + + it('waiting_approval: tool markers but insufficient length', () => { + const shortOutput = formatToolUseBlock('Read', { file_path: '/src/a.ts' }); + expect(shortOutput.trim().length).toBeLessThan(MIN_OUTPUT_LENGTH); + + expect(validateAgentOutput(shortOutput, false)).toBe('waiting_approval'); + }); + + it('waiting_approval: empty output', () => { + expect(validateAgentOutput('', false)).toBe('waiting_approval'); + }); + + it('waiting_approval: skipTests always overrides', () => { + const goodOutput = + 'Starting...\n' + + formatToolUseBlock('Read', { file_path: '/src/index.ts' }) + + formatToolUseBlock('Edit', { file_path: '/src/index.ts' }) + + 'Done implementing. '.repeat(15); + expect(goodOutput.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH); + + expect(validateAgentOutput(goodOutput, true)).toBe('waiting_approval'); + }); + + it('boundary: exactly MIN_OUTPUT_LENGTH chars with tool is verified', () => { + const tool = formatToolUseBlock('Read'); + const padding = 'x'.repeat(MIN_OUTPUT_LENGTH - tool.trim().length); + const output = tool + padding; + expect(output.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH); + + expect(validateAgentOutput(output, false)).toBe('verified'); + }); + + it('boundary: MIN_OUTPUT_LENGTH - 1 chars with tool is waiting_approval', () => { + const marker = `${TOOL_MARKER} Read\n`; + const padding = 'x'.repeat(MIN_OUTPUT_LENGTH - 1 - marker.length); + const output = marker + padding; + expect(output.trim().length).toBe(MIN_OUTPUT_LENGTH - 1); + + expect(validateAgentOutput(output, false)).toBe('waiting_approval'); + }); + }); + + describe('realistic provider scenarios', () => { + it('Claude SDK agent with multiple tools → verified', () => { + let output = "I'll implement the feature.\n\n"; + output += formatToolUseBlock('Read', { file_path: 
'/src/components/App.tsx' }); + output += 'I see the component. Let me update it.\n\n'; + output += formatToolUseBlock('Edit', { + file_path: '/src/components/App.tsx', + old_string: 'const App = () => {', + new_string: 'const App: React.FC = () => {', + }); + output += 'Done. The component is now typed correctly.\n'; + + expect(validateAgentOutput(output, false)).toBe('verified'); + }); + + it('Cursor CLI quick exit (no tools) → waiting_approval', () => { + const output = 'Task received. Processing...\nResult: completed successfully.'; + expect(validateAgentOutput(output, false)).toBe('waiting_approval'); + }); + + it('Codex CLI with brief acknowledgment → waiting_approval', () => { + const output = 'Understood the task. Starting implementation.\nDone.'; + expect(validateAgentOutput(output, false)).toBe('waiting_approval'); + }); + + it('Agent that only reads but makes no edits (single Read tool, short output) → waiting_approval', () => { + const output = formatToolUseBlock('Read', { file_path: '/src/index.ts' }) + 'File read.'; + expect(output.trim().length).toBeLessThan(MIN_OUTPUT_LENGTH); + expect(validateAgentOutput(output, false)).toBe('waiting_approval'); + }); + + it('Agent with extensive tool usage and explanation → verified', () => { + let output = 'Analyzing the codebase for the authentication feature.\n\n'; + for (let i = 0; i < 5; i++) { + output += formatToolUseBlock('Read', { file_path: `/src/auth/handler${i}.ts` }); + output += `Found handler ${i}. `; + } + output += formatToolUseBlock('Edit', { + file_path: '/src/auth/handler0.ts', + old_string: 'function login() {}', + new_string: 'async function login(creds: Credentials) { ... 
}', + }); + output += 'Implementation complete with all authentication changes applied.\n'; + + expect(validateAgentOutput(output, false)).toBe('verified'); + }); + }); +}); diff --git a/apps/server/tests/unit/services/execution-service.test.ts b/apps/server/tests/unit/services/execution-service.test.ts index 22aa4ed5e..7c2f3e0f9 100644 --- a/apps/server/tests/unit/services/execution-service.test.ts +++ b/apps/server/tests/unit/services/execution-service.test.ts @@ -211,7 +211,14 @@ describe('execution-service.ts', () => { }); // Default mocks for secureFs - vi.mocked(secureFs.readFile).mockResolvedValue('Agent output content'); + // Include tool usage markers to simulate meaningful agent output. + // The execution service checks for '🔧 Tool:' markers and minimum + // output length to determine if the agent did real work. + vi.mocked(secureFs.readFile).mockResolvedValue( + 'Starting implementation...\n\n🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\n\n' + + '🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts", "old_string": "foo", "new_string": "bar"}\n\n' + + 'Implementation complete. Updated the code as requested.' + ); vi.mocked(secureFs.access).mockResolvedValue(undefined); // Re-setup platform mocks @@ -1433,4 +1440,439 @@ describe('execution-service.ts', () => { ); }); }); + + describe('executeFeature - agent output validation', () => { + // Helper to generate realistic agent output with tool markers + const makeAgentOutput = (toolCount: number, extraText = ''): string => { + let output = 'Starting implementation...\n\n'; + for (let i = 0; i < toolCount; i++) { + output += `🔧 Tool: Edit\nInput: {"file_path": "/src/file${i}.ts", "old_string": "old${i}", "new_string": "new${i}"}\n\n`; + } + output += `Implementation complete. 
${extraText}`; + return output; + }; + + const createServiceWithMocks = () => { + return new ExecutionService( + mockEventBus, + mockConcurrencyManager, + mockWorktreeResolver, + mockSettingsService, + mockRunAgentFn, + mockExecutePipelineFn, + mockUpdateFeatureStatusFn, + mockLoadFeatureFn, + mockGetPlanningPromptPrefixFn, + mockSaveFeatureSummaryFn, + mockRecordLearningsFn, + mockContextExistsFn, + mockResumeFeatureFn, + mockTrackFailureFn, + mockSignalPauseFn, + mockRecordSuccessFn, + mockSaveExecutionStateFn, + mockLoadContextFilesFn + ); + }; + + it('sets verified when agent output has tool usage and sufficient length', async () => { + const output = makeAgentOutput(3, 'Updated authentication module with new login flow.'); + vi.mocked(secureFs.readFile).mockResolvedValue(output); + + await service.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'verified' + ); + }); + + it('sets waiting_approval when agent output is empty', async () => { + vi.mocked(secureFs.readFile).mockResolvedValue(''); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('sets waiting_approval when agent output has no tool usage markers', async () => { + // Long output but no tool markers - agent printed text but didn't use tools + const longOutputNoTools = 'I analyzed the codebase and found several issues. 
'.repeat(20); + vi.mocked(secureFs.readFile).mockResolvedValue(longOutputNoTools); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('sets waiting_approval when agent output has tool markers but is too short', async () => { + // Has a tool marker but total output is under 200 chars + const shortWithTool = '🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\nDone.'; + expect(shortWithTool.trim().length).toBeLessThan(200); + + vi.mocked(secureFs.readFile).mockResolvedValue(shortWithTool); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('sets waiting_approval when agent output file is missing (ENOENT)', async () => { + vi.mocked(secureFs.readFile).mockRejectedValue(new Error('ENOENT')); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('sets waiting_approval when agent output is only whitespace', async () => { + vi.mocked(secureFs.readFile).mockResolvedValue(' \n\n\t \n '); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('sets verified when output is exactly at the 200 char threshold with tool usage', async () => { + // Create output that's exactly 200 chars trimmed with tool markers + const toolMarker = '🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts"}\n'; + const padding = 'x'.repeat(200 - toolMarker.length); + const output = toolMarker + 
padding; + expect(output.trim().length).toBeGreaterThanOrEqual(200); + + vi.mocked(secureFs.readFile).mockResolvedValue(output); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'verified' + ); + }); + + it('sets waiting_approval when output is 199 chars with tool usage (below threshold)', async () => { + const toolMarker = '🔧 Tool: Read\n'; + const padding = 'x'.repeat(199 - toolMarker.length); + const output = toolMarker + padding; + expect(output.trim().length).toBe(199); + + vi.mocked(secureFs.readFile).mockResolvedValue(output); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('skipTests always takes priority over output validation', async () => { + // Meaningful output with tool usage - would normally be 'verified' + const output = makeAgentOutput(5, 'All changes applied successfully.'); + vi.mocked(secureFs.readFile).mockResolvedValue(output); + + mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true }); + const svc = createServiceWithMocks(); + + await svc.executeFeature('/test/project', 'feature-1'); + + // skipTests=true always means waiting_approval regardless of output quality + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('skipTests with empty output still results in waiting_approval', async () => { + vi.mocked(secureFs.readFile).mockResolvedValue(''); + + mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true }); + const svc = createServiceWithMocks(); + + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 
'feature-1', + 'waiting_approval' + ); + }); + + it('still records success even when output validation fails', async () => { + vi.mocked(secureFs.readFile).mockResolvedValue(''); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // recordSuccess should still be called - the agent ran without errors + expect(mockRecordSuccessFn).toHaveBeenCalled(); + }); + + it('still extracts summary when output has content but no tool markers', async () => { + const outputNoTools = 'A '.repeat(150); // > 200 chars but no tool markers + vi.mocked(secureFs.readFile).mockResolvedValue(outputNoTools); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // Summary extraction still runs even though status is waiting_approval + expect(extractSummary).toHaveBeenCalledWith(outputNoTools); + expect(mockSaveFeatureSummaryFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'Test summary' + ); + }); + + it('emits feature_complete with passes=true even when output validation routes to waiting_approval', async () => { + vi.mocked(secureFs.readFile).mockResolvedValue(''); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1', false, true); + + // The agent ran without error - it's still a "pass" from the execution perspective + expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith( + 'auto_mode_feature_complete', + expect.objectContaining({ passes: true }) + ); + }); + + it('handles realistic Cursor CLI output that exits quickly', async () => { + // Simulates a Cursor CLI that prints a brief message and exits + const cursorQuickExit = 'Task received. 
Processing...\nResult: completed successfully.'; + expect(cursorQuickExit.includes('🔧 Tool:')).toBe(false); + + vi.mocked(secureFs.readFile).mockResolvedValue(cursorQuickExit); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // No tool usage = waiting_approval + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('handles realistic Claude SDK output with multiple tool uses', async () => { + // Simulates a Claude SDK agent that does real work + const claudeOutput = + "I'll implement the requested feature.\n\n" + + '🔧 Tool: Read\nInput: {"file_path": "/src/components/App.tsx"}\n\n' + + 'I can see the existing component structure. Let me modify it.\n\n' + + '🔧 Tool: Edit\nInput: {"file_path": "/src/components/App.tsx", "old_string": "const App = () => {", "new_string": "const App: React.FC = () => {"}\n\n' + + '🔧 Tool: Write\nInput: {"file_path": "/src/components/NewFeature.tsx"}\n\n' + + "I've created the new component and updated the existing one. 
The feature is now implemented with proper TypeScript types."; + + vi.mocked(secureFs.readFile).mockResolvedValue(claudeOutput); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // Real work = verified + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'verified' + ); + }); + + it('reads agent output from the correct path with utf-8 encoding', async () => { + const output = makeAgentOutput(2, 'Done with changes.'); + vi.mocked(secureFs.readFile).mockResolvedValue(output); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // Verify readFile was called with the correct path derived from getFeatureDir + expect(secureFs.readFile).toHaveBeenCalledWith( + '/test/project/.automaker/features/feature-1/agent-output.md', + 'utf-8' + ); + }); + + it('completion message includes auto-verified when status is verified', async () => { + const output = makeAgentOutput(3, 'All changes applied.'); + vi.mocked(secureFs.readFile).mockResolvedValue(output); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1', false, true); + + expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith( + 'auto_mode_feature_complete', + expect.objectContaining({ + message: expect.stringContaining('auto-verified'), + }) + ); + }); + + it('completion message does NOT include auto-verified when status is waiting_approval', async () => { + // Empty output → waiting_approval + vi.mocked(secureFs.readFile).mockResolvedValue(''); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1', false, true); + + const completeCall = vi + .mocked(mockEventBus.emitAutoModeEvent) + .mock.calls.find((call) => call[0] === 'auto_mode_feature_complete'); + expect(completeCall).toBeDefined(); + expect((completeCall![1] as { message: string }).message).not.toContain('auto-verified'); + }); + 
+ it('uses same agentOutput for both status determination and summary extraction', async () => { + // Specific output that is long enough with tool markers (verified path) + // AND has content for summary extraction + const specificOutput = + '🔧 Tool: Read\nReading file...\n🔧 Tool: Edit\nEditing file...\n' + + 'The implementation is complete. Here is a detailed description of what was done. '.repeat( + 3 + ); + vi.mocked(secureFs.readFile).mockResolvedValue(specificOutput); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // Status should be verified (has tools + long enough) + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'verified' + ); + // extractSummary should receive the exact same output + expect(extractSummary).toHaveBeenCalledWith(specificOutput); + // recordLearnings should also receive the same output + expect(mockRecordLearningsFn).toHaveBeenCalledWith( + '/test/project', + testFeature, + specificOutput + ); + }); + + it('does not call recordMemoryUsage when output is empty and memoryFiles is empty', async () => { + vi.mocked(secureFs.readFile).mockResolvedValue(''); + const { recordMemoryUsage } = await import('@automaker/utils'); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // With empty output and empty memoryFiles, recordMemoryUsage should not be called + expect(recordMemoryUsage).not.toHaveBeenCalled(); + }); + + it('handles output with special unicode characters correctly', async () => { + // Output with various unicode but includes tool markers + const unicodeOutput = + '🔧 Tool: Read\n' + + '🔧 Tool: Edit\n' + + 'Añadiendo función de búsqueda con caracteres especiales: ñ, ü, ö, é, 日本語テスト. ' + + 'Die Änderungen wurden erfolgreich implementiert. 
'.repeat(3); + vi.mocked(secureFs.readFile).mockResolvedValue(unicodeOutput); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // Should still detect tool markers and sufficient length + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'verified' + ); + }); + + it('treats output with only newlines and spaces around tool marker as insufficient', async () => { + // Has tool marker but surrounded by whitespace, total trimmed < 200 + const sparseOutput = '\n\n 🔧 Tool: Read \n\n'; + expect(sparseOutput.trim().length).toBeLessThan(200); + + vi.mocked(secureFs.readFile).mockResolvedValue(sparseOutput); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('detects tool marker substring correctly (partial match like "🔧 Tools:" does not count)', async () => { + // Output with a similar but not exact marker - "🔧 Tools:" instead of "🔧 Tool:" + const wrongMarker = '🔧 Tools: Read\n🔧 Tools: Edit\n' + 'Implementation done. 
'.repeat(20); + expect(wrongMarker.includes('🔧 Tool:')).toBe(false); + + vi.mocked(secureFs.readFile).mockResolvedValue(wrongMarker); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // "🔧 Tools:" is not the same as "🔧 Tool:" - should be waiting_approval + expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith( + '/test/project', + 'feature-1', + 'waiting_approval' + ); + }); + + it('pipeline merge_conflict status short-circuits before output validation', async () => { + // Set up pipeline that results in merge_conflict + vi.mocked(pipelineService.getPipelineConfig).mockResolvedValue({ + version: 1, + steps: [{ id: 'step-1', name: 'Step 1', order: 1, instructions: 'Do step 1' }] as any, + }); + + // After pipeline, loadFeature returns merge_conflict status + let loadCallCount = 0; + mockLoadFeatureFn = vi.fn().mockImplementation(() => { + loadCallCount++; + if (loadCallCount === 1) return testFeature; // initial load + // All subsequent loads (task check + pipeline refresh) return merge_conflict + return { ...testFeature, status: 'merge_conflict' }; + }); + + const svc = createServiceWithMocks(); + await svc.executeFeature('/test/project', 'feature-1'); + + // Should NOT have called updateFeatureStatusFn with 'verified' or 'waiting_approval' + // because pipeline merge_conflict short-circuits the method + const statusCalls = vi + .mocked(mockUpdateFeatureStatusFn) + .mock.calls.filter((call) => call[2] === 'verified' || call[2] === 'waiting_approval'); + // The only non-in_progress status call should be absent since merge_conflict returns early + expect(statusCalls.length).toBe(0); + }); + }); }); diff --git a/apps/ui/src/components/dialogs/pr-comment-resolution-dialog.tsx b/apps/ui/src/components/dialogs/pr-comment-resolution-dialog.tsx index b0f86aa1d..fd5c01e7e 100644 --- a/apps/ui/src/components/dialogs/pr-comment-resolution-dialog.tsx +++ b/apps/ui/src/components/dialogs/pr-comment-resolution-dialog.tsx @@ 
-45,7 +45,7 @@ import { toast } from 'sonner'; import type { PRReviewComment } from '@/lib/electron'; import type { Feature } from '@/store/app-store'; import type { PhaseModelEntry } from '@automaker/types'; -import { supportsReasoningEffort, isAdaptiveThinkingModel } from '@automaker/types'; +import { supportsReasoningEffort, normalizeThinkingLevelForModel } from '@automaker/types'; import { resolveModelString } from '@automaker/model-resolver'; import { PhaseModelSelector } from '@/components/views/settings-view/model-defaults'; @@ -590,20 +590,10 @@ export function PRCommentResolutionDialog({ const wasOpenRef = useRef(false); const handleModelChange = useCallback((entry: PhaseModelEntry) => { - // Normalize thinking level when switching between adaptive and non-adaptive models - const isNewModelAdaptive = - typeof entry.model === 'string' && isAdaptiveThinkingModel(entry.model); - const currentLevel = entry.thinkingLevel || 'none'; - - if (isNewModelAdaptive && currentLevel !== 'none' && currentLevel !== 'adaptive') { - // Switching TO an adaptive model with a manual level -> auto-switch to 'adaptive' - setModelEntry({ ...entry, thinkingLevel: 'adaptive' }); - } else if (!isNewModelAdaptive && currentLevel === 'adaptive') { - // Switching FROM an adaptive model with adaptive -> auto-switch to 'high' - setModelEntry({ ...entry, thinkingLevel: 'high' }); - } else { - setModelEntry(entry); - } + const modelId = typeof entry.model === 'string' ? 
entry.model : ''; + const normalizedThinkingLevel = normalizeThinkingLevelForModel(modelId, entry.thinkingLevel); + + setModelEntry({ ...entry, thinkingLevel: normalizedThinkingLevel }); }, []); // Fetch PR review comments diff --git a/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx b/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx index 86a684646..b62edfc1a 100644 --- a/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx +++ b/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx @@ -28,11 +28,7 @@ import { cn } from '@/lib/utils'; import { modelSupportsThinking } from '@/lib/utils'; import { useAppStore, ThinkingLevel, FeatureImage, PlanningMode, Feature } from '@/store/app-store'; import type { ReasoningEffort, PhaseModelEntry, AgentModel } from '@automaker/types'; -import { - supportsReasoningEffort, - isAdaptiveThinkingModel, - getThinkingLevelsForModel, -} from '@automaker/types'; +import { supportsReasoningEffort, normalizeThinkingLevelForModel } from '@automaker/types'; import { PrioritySelector, WorkModeSelector, @@ -308,20 +304,10 @@ export function AddFeatureDialog({ }, [planningMode]); const handleModelChange = (entry: PhaseModelEntry) => { - // Normalize thinking level when switching between adaptive and non-adaptive models - const isNewModelAdaptive = - typeof entry.model === 'string' && isAdaptiveThinkingModel(entry.model); - const currentLevel = entry.thinkingLevel || 'none'; - - if (isNewModelAdaptive && currentLevel !== 'none' && currentLevel !== 'adaptive') { - // Switching TO Opus 4.6 with a manual level -> auto-switch to 'adaptive' - setModelEntry({ ...entry, thinkingLevel: 'adaptive' }); - } else if (!isNewModelAdaptive && currentLevel === 'adaptive') { - // Switching FROM Opus 4.6 with adaptive -> auto-switch to 'high' - setModelEntry({ ...entry, thinkingLevel: 'high' }); - } else { - setModelEntry(entry); - } + const modelId = typeof entry.model === 'string' ? 
entry.model : ''; + const normalizedThinkingLevel = normalizeThinkingLevelForModel(modelId, entry.thinkingLevel); + + setModelEntry({ ...entry, thinkingLevel: normalizedThinkingLevel }); }; const buildFeatureData = (): FeatureData | null => { diff --git a/apps/ui/src/styles/themes/paper.css b/apps/ui/src/styles/themes/paper.css index 620f41733..290bb69a3 100644 --- a/apps/ui/src/styles/themes/paper.css +++ b/apps/ui/src/styles/themes/paper.css @@ -33,11 +33,11 @@ --input: oklch(0.98 0 0); --ring: oklch(0.3 0 0); - --chart-1: oklch(0.3 0 0); - --chart-2: oklch(0.5 0 0); - --chart-3: oklch(0.4 0 0); - --chart-4: oklch(0.6 0 0); - --chart-5: oklch(0.35 0 0); + --chart-1: oklch(0.5 0.14 25); /* Warm red - strings, regex */ + --chart-2: oklch(0.5 0.13 250); /* Blue - properties, variables */ + --chart-3: oklch(0.55 0.13 145); /* Green - numbers */ + --chart-4: oklch(0.45 0.14 300); /* Purple - keywords, booleans, tags */ + --chart-5: oklch(0.5 0.12 180); /* Teal - types, classes */ --sidebar: oklch(0.98 0 0); --sidebar-foreground: oklch(0.15 0 0); diff --git a/apps/ui/tests/features/opus-thinking-level-none.spec.ts b/apps/ui/tests/features/opus-thinking-level-none.spec.ts new file mode 100644 index 000000000..6c421deaa --- /dev/null +++ b/apps/ui/tests/features/opus-thinking-level-none.spec.ts @@ -0,0 +1,95 @@ +import { test, expect } from '@playwright/test'; +import * as fs from 'fs'; +import * as path from 'path'; +import { + createTempDirPath, + cleanupTempDir, + setupRealProject, + waitForNetworkIdle, + clickAddFeature, + fillAddFeatureDialog, + confirmAddFeature, + authenticateForTests, + handleLoginScreenIfPresent, +} from '../utils'; + +const TEST_TEMP_DIR = createTempDirPath('opus-thinking-level-none'); + +test.describe('Opus thinking level', () => { + let projectPath: string; + const projectName = `test-project-${Date.now()}`; + + test.beforeAll(async () => { + if (!fs.existsSync(TEST_TEMP_DIR)) { + fs.mkdirSync(TEST_TEMP_DIR, { recursive: true }); + } + + 
projectPath = path.join(TEST_TEMP_DIR, projectName); + fs.mkdirSync(projectPath, { recursive: true }); + + fs.writeFileSync( + path.join(projectPath, 'package.json'), + JSON.stringify({ name: projectName, version: '1.0.0' }, null, 2) + ); + + const automakerDir = path.join(projectPath, '.automaker'); + fs.mkdirSync(automakerDir, { recursive: true }); + fs.mkdirSync(path.join(automakerDir, 'features'), { recursive: true }); + fs.mkdirSync(path.join(automakerDir, 'context'), { recursive: true }); + + fs.writeFileSync( + path.join(automakerDir, 'categories.json'), + JSON.stringify({ categories: [] }, null, 2) + ); + + fs.writeFileSync( + path.join(automakerDir, 'app_spec.txt'), + `# ${projectName}\n\nA test project for Opus thinking level e2e coverage.` + ); + }); + + test.afterAll(async () => { + cleanupTempDir(TEST_TEMP_DIR); + }); + + test('persists thinkingLevel none when selected for Claude Opus', async ({ page }) => { + const featureDescription = `Opus none thinking ${Date.now()}`; + + await setupRealProject(page, projectPath, projectName, { setAsCurrent: true }); + await authenticateForTests(page); + await page.goto('/board'); + await page.waitForLoadState('load'); + await handleLoginScreenIfPresent(page); + await waitForNetworkIdle(page); + + await clickAddFeature(page); + await fillAddFeatureDialog(page, featureDescription); + + await page.locator('[data-testid="model-selector"]').click(); + await page.locator('[cmdk-input]').fill('opus'); + + const opusItem = page.locator('[cmdk-item]').filter({ hasText: 'Claude Opus' }).first(); + await expect(opusItem).toBeVisible({ timeout: 10000 }); + await opusItem.locator('button[title="Adjust thinking level"]').click(); + + const noneOption = page.getByRole('button', { name: /None/i }).first(); + await noneOption.click(); + + await expect(page.locator('[data-testid="model-selector"]')).not.toContainText('Adaptive'); + + await confirmAddFeature(page); + + const featuresDir = path.join(projectPath, '.automaker', 
'features'); + await expect.poll(() => fs.readdirSync(featuresDir).length).toBe(1); + + const featureDir = fs.readdirSync(featuresDir)[0]; + const featureJsonPath = path.join(featuresDir, featureDir, 'feature.json'); + const featureJson = JSON.parse(fs.readFileSync(featureJsonPath, 'utf-8')) as { + description: string; + thinkingLevel: string; + }; + + expect(featureJson.description).toBe(featureDescription); + expect(featureJson.thinkingLevel).toBe('none'); + }); +}); diff --git a/libs/types/src/index.ts b/libs/types/src/index.ts index 21d7f7c0c..b52df4131 100644 --- a/libs/types/src/index.ts +++ b/libs/types/src/index.ts @@ -202,6 +202,7 @@ export { getThinkingTokenBudget, isAdaptiveThinkingModel, getThinkingLevelsForModel, + normalizeThinkingLevelForModel, getDefaultThinkingLevel, // Event hook constants EVENT_HOOK_TRIGGER_LABELS, diff --git a/libs/types/src/settings.ts b/libs/types/src/settings.ts index 759caedf0..52e881e84 100644 --- a/libs/types/src/settings.ts +++ b/libs/types/src/settings.ts @@ -349,6 +349,28 @@ export function getThinkingLevelsForModel(model: string): ThinkingLevel[] { return ['none', 'low', 'medium', 'high', 'ultrathink']; } +/** + * Normalize a selected thinking level to a value supported by the target model. + * Prefers preserving the selected level, falls back to 'none' when available. + */ +export function normalizeThinkingLevelForModel( + model: string, + thinkingLevel: ThinkingLevel | undefined +): ThinkingLevel { + const availableLevels = getThinkingLevelsForModel(model); + const currentLevel = thinkingLevel || 'none'; + + if (availableLevels.includes(currentLevel)) { + return currentLevel; + } + + if (availableLevels.includes('none')) { + return 'none'; + } + + return availableLevels[0]; +} + /** * Get the default thinking level for a given model. * Used when selecting a model via the primary button in the two-stage selector. 
From 70145177f3e5986a428de99ef40af64ec43337cc Mon Sep 17 00:00:00 2001 From: gsxdsm Date: Tue, 24 Feb 2026 17:25:58 -0800 Subject: [PATCH 2/5] feat: Enhance provider error messages with diagnostic context, address test failure, fix port change, move playwright tests to different port --- apps/server/src/providers/copilot-provider.ts | 7 +- apps/server/src/providers/cursor-provider.ts | 6 +- apps/server/src/providers/gemini-provider.ts | 9 +- apps/server/src/services/agent-executor.ts | 56 +++- .../unit/providers/copilot-provider.test.ts | 39 +++ .../unit/providers/cursor-provider.test.ts | 120 ++++++- .../unit/providers/gemini-provider.test.ts | 137 ++++++++ .../unit/services/agent-executor.test.ts | 303 ++++++++++++++++++ apps/ui/playwright.config.ts | 4 +- apps/ui/scripts/kill-test-servers.mjs | 4 +- .../board-view/dialogs/add-feature-dialog.tsx | 6 +- .../model-defaults/phase-model-selector.tsx | 20 +- .../features/opus-thinking-level-none.spec.ts | 17 +- .../planning-mode-fix-verification.spec.ts | 9 +- apps/ui/tests/utils/core/waiting.ts | 11 +- apps/ui/tests/utils/views/board.ts | 14 +- apps/ui/vite.config.mts | 2 +- libs/types/src/provider.ts | 8 +- 18 files changed, 735 insertions(+), 37 deletions(-) diff --git a/apps/server/src/providers/copilot-provider.ts b/apps/server/src/providers/copilot-provider.ts index 5ccdfbf0d..b76c5cd4a 100644 --- a/apps/server/src/providers/copilot-provider.ts +++ b/apps/server/src/providers/copilot-provider.ts @@ -389,9 +389,14 @@ export class CopilotProvider extends CliProvider { case 'session.error': { const errorEvent = sdkEvent as SdkSessionErrorEvent; + const enrichedError = + errorEvent.data.message || + (errorEvent.data.code + ? 
`Copilot agent error (code: ${errorEvent.data.code})` + : 'Copilot agent error'); return { type: 'error', - error: errorEvent.data.message || 'Unknown error', + error: enrichedError, }; } diff --git a/apps/server/src/providers/cursor-provider.ts b/apps/server/src/providers/cursor-provider.ts index 450b3a74a..6c0d98e75 100644 --- a/apps/server/src/providers/cursor-provider.ts +++ b/apps/server/src/providers/cursor-provider.ts @@ -562,10 +562,14 @@ export class CursorProvider extends CliProvider { const resultEvent = cursorEvent as CursorResultEvent; if (resultEvent.is_error) { + const errorText = resultEvent.error || resultEvent.result || ''; + const enrichedError = + errorText || + `Cursor agent failed (duration: ${resultEvent.duration_ms}ms, subtype: ${resultEvent.subtype}, session: ${resultEvent.session_id ?? 'none'})`; return { type: 'error', session_id: resultEvent.session_id, - error: resultEvent.error || resultEvent.result || 'Unknown error', + error: enrichedError, }; } diff --git a/apps/server/src/providers/gemini-provider.ts b/apps/server/src/providers/gemini-provider.ts index 4d2304771..f9425d907 100644 --- a/apps/server/src/providers/gemini-provider.ts +++ b/apps/server/src/providers/gemini-provider.ts @@ -381,10 +381,13 @@ export class GeminiProvider extends CliProvider { const resultEvent = geminiEvent as GeminiResultEvent; if (resultEvent.status === 'error') { + const enrichedError = + resultEvent.error || + `Gemini agent failed (duration: ${resultEvent.stats?.duration_ms ?? 'unknown'}ms, session: ${resultEvent.session_id ?? 'none'})`; return { type: 'error', session_id: resultEvent.session_id, - error: resultEvent.error || 'Unknown error', + error: enrichedError, }; } @@ -401,10 +404,12 @@ export class GeminiProvider extends CliProvider { case 'error': { const errorEvent = geminiEvent as GeminiResultEvent; + const enrichedError = + errorEvent.error || `Gemini agent failed (session: ${errorEvent.session_id ?? 
'none'})`; return { type: 'error', session_id: errorEvent.session_id, - error: errorEvent.error || 'Unknown error', + error: enrichedError, }; } diff --git a/apps/server/src/services/agent-executor.ts b/apps/server/src/services/agent-executor.ts index a3c88f6c8..f79307665 100644 --- a/apps/server/src/services/agent-executor.ts +++ b/apps/server/src/services/agent-executor.ts @@ -296,8 +296,28 @@ export class AgentExecutor { } } } else if (msg.type === 'error') { - throw new Error(AgentExecutor.sanitizeProviderError(msg.error)); - } else if (msg.type === 'result' && msg.subtype === 'success') scheduleWrite(); + const sanitized = AgentExecutor.sanitizeProviderError(msg.error); + logger.error( + `[execute] Feature ${featureId} received error from provider. ` + + `raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}` + ); + throw new Error(sanitized); + } else if (msg.type === 'result') { + if (msg.subtype === 'success') { + scheduleWrite(); + } else if (msg.subtype?.startsWith('error')) { + // Non-success result subtypes from the SDK (error_max_turns, error_during_execution, etc.) + logger.error( + `[execute] Feature ${featureId} ended with error subtype: ${msg.subtype}. ` + + `session_id=${msg.session_id ?? 
'none'}` + ); + throw new Error(`Agent execution ended with: ${msg.subtype}`); + } else { + logger.warn( + `[execute] Feature ${featureId} received unhandled result subtype: ${msg.subtype}` + ); + } + } } } finally { clearInterval(streamHeartbeat); @@ -447,16 +467,28 @@ export class AgentExecutor { }); } } else if (msg.type === 'error') { - // Clean the error: strip ANSI codes and redundant "Error: " prefix - const cleanedError = - (msg.error || `Error during task ${task.id}`) - .replace(/\x1b\[[0-9;]*m/g, '') - .replace(/^Error:\s*/i, '') - .trim() || `Error during task ${task.id}`; - throw new Error(cleanedError); - } else if (msg.type === 'result' && msg.subtype === 'success') { - taskOutput += msg.result || ''; - responseText += msg.result || ''; + const fallback = `Error during task ${task.id}`; + const sanitized = AgentExecutor.sanitizeProviderError(msg.error || fallback); + logger.error( + `[executeTasksLoop] Feature ${featureId} task ${task.id} received error from provider. ` + + `raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}` + ); + throw new Error(sanitized); + } else if (msg.type === 'result') { + if (msg.subtype === 'success') { + taskOutput += msg.result || ''; + responseText += msg.result || ''; + } else if (msg.subtype?.startsWith('error')) { + logger.error( + `[executeTasksLoop] Feature ${featureId} task ${task.id} ended with error subtype: ${msg.subtype}. ` + + `session_id=${msg.session_id ?? 
'none'}` + ); + throw new Error(`Agent execution ended with: ${msg.subtype}`); + } else { + logger.warn( + `[executeTasksLoop] Feature ${featureId} task ${task.id} received unhandled result subtype: ${msg.subtype}` + ); + } } } if (!taskCompleteDetected) diff --git a/apps/server/tests/unit/providers/copilot-provider.test.ts b/apps/server/tests/unit/providers/copilot-provider.test.ts index 552e75304..55db34dfa 100644 --- a/apps/server/tests/unit/providers/copilot-provider.test.ts +++ b/apps/server/tests/unit/providers/copilot-provider.test.ts @@ -397,6 +397,45 @@ describe('copilot-provider.ts', () => { }); }); + it('should use error code in fallback when session.error message is empty', () => { + const event = { + type: 'session.error', + data: { message: '', code: 'RATE_LIMIT_EXCEEDED' }, + }; + + const result = provider.normalizeEvent(event); + expect(result).not.toBeNull(); + expect(result!.type).toBe('error'); + expect(result!.error).toContain('RATE_LIMIT_EXCEEDED'); + expect(result!.error).not.toBe('Unknown error'); + }); + + it('should return generic "Copilot agent error" fallback when both message and code are empty', () => { + const event = { + type: 'session.error', + data: { message: '', code: '' }, + }; + + const result = provider.normalizeEvent(event); + expect(result).not.toBeNull(); + expect(result!.type).toBe('error'); + expect(result!.error).toBe('Copilot agent error'); + // Must NOT be the old opaque 'Unknown error' + expect(result!.error).not.toBe('Unknown error'); + }); + + it('should return generic "Copilot agent error" fallback when data has no code field', () => { + const event = { + type: 'session.error', + data: { message: '' }, + }; + + const result = provider.normalizeEvent(event); + expect(result).not.toBeNull(); + expect(result!.type).toBe('error'); + expect(result!.error).toBe('Copilot agent error'); + }); + it('should return null for unknown event types', () => { const event = { type: 'unknown.event' }; diff --git 
a/apps/server/tests/unit/providers/cursor-provider.test.ts b/apps/server/tests/unit/providers/cursor-provider.test.ts index 9eff5d306..0e41d9630 100644 --- a/apps/server/tests/unit/providers/cursor-provider.test.ts +++ b/apps/server/tests/unit/providers/cursor-provider.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect } from 'vitest'; +import { describe, it, expect, beforeEach } from 'vitest'; import { CursorProvider } from '@/providers/cursor-provider.js'; describe('cursor-provider.ts', () => { @@ -36,4 +36,122 @@ describe('cursor-provider.ts', () => { expect(args).not.toContain('--resume'); }); }); + + describe('normalizeEvent - result error handling', () => { + let provider: CursorProvider; + + beforeEach(() => { + provider = Object.create(CursorProvider.prototype) as CursorProvider; + }); + + it('returns error message from resultEvent.error when is_error=true', () => { + const event = { + type: 'result', + is_error: true, + error: 'Rate limit exceeded', + result: '', + subtype: 'error', + duration_ms: 3000, + session_id: 'sess-123', + }; + + const msg = provider.normalizeEvent(event); + + expect(msg).not.toBeNull(); + expect(msg!.type).toBe('error'); + expect(msg!.error).toBe('Rate limit exceeded'); + }); + + it('falls back to resultEvent.result when error field is empty and is_error=true', () => { + const event = { + type: 'result', + is_error: true, + error: '', + result: 'Process terminated unexpectedly', + subtype: 'error', + duration_ms: 5000, + session_id: 'sess-456', + }; + + const msg = provider.normalizeEvent(event); + + expect(msg).not.toBeNull(); + expect(msg!.type).toBe('error'); + expect(msg!.error).toBe('Process terminated unexpectedly'); + }); + + it('builds diagnostic fallback when both error and result are empty and is_error=true', () => { + const event = { + type: 'result', + is_error: true, + error: '', + result: '', + subtype: 'error', + duration_ms: 5000, + session_id: 'sess-789', + }; + + const msg = provider.normalizeEvent(event); + 
+ expect(msg).not.toBeNull(); + expect(msg!.type).toBe('error'); + // Should contain diagnostic info rather than 'Unknown error' + expect(msg!.error).toContain('5000ms'); + expect(msg!.error).toContain('sess-789'); + expect(msg!.error).not.toBe('Unknown error'); + }); + + it('preserves session_id in error message', () => { + const event = { + type: 'result', + is_error: true, + error: 'Timeout occurred', + result: '', + subtype: 'error', + duration_ms: 30000, + session_id: 'my-session-id', + }; + + const msg = provider.normalizeEvent(event); + + expect(msg!.session_id).toBe('my-session-id'); + }); + + it('uses "none" when session_id is missing from diagnostic fallback', () => { + const event = { + type: 'result', + is_error: true, + error: '', + result: '', + subtype: 'error', + duration_ms: 5000, + // session_id intentionally omitted + }; + + const msg = provider.normalizeEvent(event); + + expect(msg).not.toBeNull(); + expect(msg!.type).toBe('error'); + expect(msg!.error).toContain('none'); + expect(msg!.error).not.toContain('undefined'); + }); + + it('returns success result when is_error=false', () => { + const event = { + type: 'result', + is_error: false, + error: '', + result: 'Completed successfully', + subtype: 'success', + duration_ms: 2000, + session_id: 'sess-ok', + }; + + const msg = provider.normalizeEvent(event); + + expect(msg).not.toBeNull(); + expect(msg!.type).toBe('result'); + expect(msg!.subtype).toBe('success'); + }); + }); }); diff --git a/apps/server/tests/unit/providers/gemini-provider.test.ts b/apps/server/tests/unit/providers/gemini-provider.test.ts index c63a5a60c..9a29c765e 100644 --- a/apps/server/tests/unit/providers/gemini-provider.test.ts +++ b/apps/server/tests/unit/providers/gemini-provider.test.ts @@ -1,5 +1,6 @@ import { describe, it, expect, beforeEach } from 'vitest'; import { GeminiProvider } from '@/providers/gemini-provider.js'; +import type { ProviderMessage } from '@automaker/types'; describe('gemini-provider.ts', () => { 
let provider: GeminiProvider; @@ -116,4 +117,140 @@ describe('gemini-provider.ts', () => { expect(args[modelIndex + 1]).toBe('gemini-2.5-pro'); }); }); + + describe('normalizeEvent - error handling', () => { + it('returns error from result event when status=error and error field is set', () => { + const event = { + type: 'result', + status: 'error', + error: 'Model overloaded', + session_id: 'sess-gemini-1', + stats: { duration_ms: 4000, total_tokens: 0 }, + }; + + const msg = provider.normalizeEvent(event) as ProviderMessage; + + expect(msg).not.toBeNull(); + expect(msg.type).toBe('error'); + expect(msg.error).toBe('Model overloaded'); + expect(msg.session_id).toBe('sess-gemini-1'); + }); + + it('builds diagnostic fallback when result event has status=error but empty error field', () => { + const event = { + type: 'result', + status: 'error', + error: '', + session_id: 'sess-gemini-2', + stats: { duration_ms: 7500, total_tokens: 0 }, + }; + + const msg = provider.normalizeEvent(event) as ProviderMessage; + + expect(msg).not.toBeNull(); + expect(msg.type).toBe('error'); + // Diagnostic info should be present instead of 'Unknown error' + expect(msg.error).toContain('7500ms'); + expect(msg.error).toContain('sess-gemini-2'); + expect(msg.error).not.toBe('Unknown error'); + }); + + it('builds fallback with "unknown" duration when stats are missing', () => { + const event = { + type: 'result', + status: 'error', + error: '', + session_id: 'sess-gemini-nostats', + // no stats field + }; + + const msg = provider.normalizeEvent(event) as ProviderMessage; + + expect(msg).not.toBeNull(); + expect(msg.type).toBe('error'); + expect(msg.error).toContain('unknown'); + }); + + it('returns error from standalone error event with error field set', () => { + const event = { + type: 'error', + error: 'API key invalid', + session_id: 'sess-gemini-3', + }; + + const msg = provider.normalizeEvent(event) as ProviderMessage; + + expect(msg).not.toBeNull(); + expect(msg.type).toBe('error'); 
+ expect(msg.error).toBe('API key invalid'); + }); + + it('builds diagnostic fallback when standalone error event has empty error field', () => { + const event = { + type: 'error', + error: '', + session_id: 'sess-gemini-empty', + }; + + const msg = provider.normalizeEvent(event) as ProviderMessage; + + expect(msg).not.toBeNull(); + expect(msg.type).toBe('error'); + // Should include session_id, not just 'Unknown error' + expect(msg.error).toContain('sess-gemini-empty'); + expect(msg.error).not.toBe('Unknown error'); + }); + + it('builds fallback mentioning "none" when session_id is missing from error event', () => { + const event = { + type: 'error', + error: '', + // no session_id + }; + + const msg = provider.normalizeEvent(event) as ProviderMessage; + + expect(msg).not.toBeNull(); + expect(msg.type).toBe('error'); + expect(msg.error).toContain('none'); + }); + + it('uses consistent "Gemini agent failed" label for both result and error event fallbacks', () => { + const resultEvent = { + type: 'result', + status: 'error', + error: '', + session_id: 'sess-r', + stats: { duration_ms: 1000 }, + }; + const errorEvent = { + type: 'error', + error: '', + session_id: 'sess-e', + }; + + const resultMsg = provider.normalizeEvent(resultEvent) as ProviderMessage; + const errorMsg = provider.normalizeEvent(errorEvent) as ProviderMessage; + + // Both fallback messages should use the same "Gemini agent failed" prefix + expect(resultMsg.error).toContain('Gemini agent failed'); + expect(errorMsg.error).toContain('Gemini agent failed'); + }); + + it('returns success result when result event has status=success', () => { + const event = { + type: 'result', + status: 'success', + error: '', + session_id: 'sess-gemini-ok', + stats: { duration_ms: 1200, total_tokens: 500 }, + }; + + const msg = provider.normalizeEvent(event) as ProviderMessage; + + expect(msg).not.toBeNull(); + expect(msg.type).toBe('result'); + expect(msg.subtype).toBe('success'); + }); + }); }); diff --git 
a/apps/server/tests/unit/services/agent-executor.test.ts b/apps/server/tests/unit/services/agent-executor.test.ts index 09f12cf44..e47b1a013 100644 --- a/apps/server/tests/unit/services/agent-executor.test.ts +++ b/apps/server/tests/unit/services/agent-executor.test.ts @@ -685,6 +685,309 @@ describe('AgentExecutor', () => { await expect(executor.execute(options, callbacks)).rejects.toThrow('API rate limit exceeded'); }); + it('should throw "Unknown error" when provider stream yields error with empty message', async () => { + const executor = new AgentExecutor( + mockEventBus, + mockFeatureStateManager, + mockPlanApprovalService, + mockSettingsService + ); + + const mockProvider = { + getName: () => 'mock', + executeQuery: vi.fn().mockImplementation(function* () { + yield { + type: 'error', + error: '', + session_id: 'sess-123', + }; + }), + } as unknown as BaseProvider; + + const options: AgentExecutionOptions = { + workDir: '/test', + featureId: 'test-feature', + prompt: 'Test prompt', + projectPath: '/project', + abortController: new AbortController(), + provider: mockProvider, + effectiveBareModel: 'claude-sonnet-4-6', + planningMode: 'skip', + }; + + const callbacks = { + waitForApproval: vi.fn().mockResolvedValue({ approved: true }), + saveFeatureSummary: vi.fn(), + updateFeatureSummary: vi.fn(), + buildTaskPrompt: vi.fn().mockReturnValue('task prompt'), + }; + + await expect(executor.execute(options, callbacks)).rejects.toThrow('Unknown error'); + }); + + it('should throw with sanitized error when provider yields ANSI-decorated error', async () => { + const executor = new AgentExecutor( + mockEventBus, + mockFeatureStateManager, + mockPlanApprovalService, + mockSettingsService + ); + + const mockProvider = { + getName: () => 'mock', + executeQuery: vi.fn().mockImplementation(function* () { + yield { + type: 'error', + // ANSI color codes + "Error: " prefix that should be stripped + error: '\x1b[31mError: Connection refused\x1b[0m', + }; + }), + } as unknown 
as BaseProvider; + + const options: AgentExecutionOptions = { + workDir: '/test', + featureId: 'test-feature', + prompt: 'Test prompt', + projectPath: '/project', + abortController: new AbortController(), + provider: mockProvider, + effectiveBareModel: 'claude-sonnet-4-6', + planningMode: 'skip', + }; + + const callbacks = { + waitForApproval: vi.fn().mockResolvedValue({ approved: true }), + saveFeatureSummary: vi.fn(), + updateFeatureSummary: vi.fn(), + buildTaskPrompt: vi.fn().mockReturnValue('task prompt'), + }; + + // Should strip ANSI codes and "Error: " prefix + await expect(executor.execute(options, callbacks)).rejects.toThrow('Connection refused'); + }); + + it('should throw when result subtype is error_max_turns', async () => { + const executor = new AgentExecutor( + mockEventBus, + mockFeatureStateManager, + mockPlanApprovalService, + mockSettingsService + ); + + const mockProvider = { + getName: () => 'mock', + executeQuery: vi.fn().mockImplementation(function* () { + yield { + type: 'assistant', + message: { + content: [{ type: 'text', text: 'Working on it...' 
}], + }, + }; + yield { + type: 'result', + subtype: 'error_max_turns', + session_id: 'sess-456', + }; + }), + } as unknown as BaseProvider; + + const options: AgentExecutionOptions = { + workDir: '/test', + featureId: 'test-feature', + prompt: 'Test prompt', + projectPath: '/project', + abortController: new AbortController(), + provider: mockProvider, + effectiveBareModel: 'claude-sonnet-4-6', + planningMode: 'skip', + }; + + const callbacks = { + waitForApproval: vi.fn().mockResolvedValue({ approved: true }), + saveFeatureSummary: vi.fn(), + updateFeatureSummary: vi.fn(), + buildTaskPrompt: vi.fn().mockReturnValue('task prompt'), + }; + + await expect(executor.execute(options, callbacks)).rejects.toThrow( + 'Agent execution ended with: error_max_turns' + ); + }); + + it('should throw when result subtype is error_during_execution', async () => { + const executor = new AgentExecutor( + mockEventBus, + mockFeatureStateManager, + mockPlanApprovalService, + mockSettingsService + ); + + const mockProvider = { + getName: () => 'mock', + executeQuery: vi.fn().mockImplementation(function* () { + yield { + type: 'result', + subtype: 'error_during_execution', + session_id: 'sess-789', + }; + }), + } as unknown as BaseProvider; + + const options: AgentExecutionOptions = { + workDir: '/test', + featureId: 'test-feature', + prompt: 'Test prompt', + projectPath: '/project', + abortController: new AbortController(), + provider: mockProvider, + effectiveBareModel: 'claude-sonnet-4-6', + planningMode: 'skip', + }; + + const callbacks = { + waitForApproval: vi.fn().mockResolvedValue({ approved: true }), + saveFeatureSummary: vi.fn(), + updateFeatureSummary: vi.fn(), + buildTaskPrompt: vi.fn().mockReturnValue('task prompt'), + }; + + await expect(executor.execute(options, callbacks)).rejects.toThrow( + 'Agent execution ended with: error_during_execution' + ); + }); + + it('should throw when result subtype is error_max_structured_output_retries', async () => { + const executor = new 
AgentExecutor( + mockEventBus, + mockFeatureStateManager, + mockPlanApprovalService, + mockSettingsService + ); + + const mockProvider = { + getName: () => 'mock', + executeQuery: vi.fn().mockImplementation(function* () { + yield { + type: 'result', + subtype: 'error_max_structured_output_retries', + }; + }), + } as unknown as BaseProvider; + + const options: AgentExecutionOptions = { + workDir: '/test', + featureId: 'test-feature', + prompt: 'Test prompt', + projectPath: '/project', + abortController: new AbortController(), + provider: mockProvider, + effectiveBareModel: 'claude-sonnet-4-6', + planningMode: 'skip', + }; + + const callbacks = { + waitForApproval: vi.fn().mockResolvedValue({ approved: true }), + saveFeatureSummary: vi.fn(), + updateFeatureSummary: vi.fn(), + buildTaskPrompt: vi.fn().mockReturnValue('task prompt'), + }; + + await expect(executor.execute(options, callbacks)).rejects.toThrow( + 'Agent execution ended with: error_max_structured_output_retries' + ); + }); + + it('should throw when result subtype is error_max_budget_usd', async () => { + const executor = new AgentExecutor( + mockEventBus, + mockFeatureStateManager, + mockPlanApprovalService, + mockSettingsService + ); + + const mockProvider = { + getName: () => 'mock', + executeQuery: vi.fn().mockImplementation(function* () { + yield { + type: 'result', + subtype: 'error_max_budget_usd', + session_id: 'sess-budget', + }; + }), + } as unknown as BaseProvider; + + const options: AgentExecutionOptions = { + workDir: '/test', + featureId: 'test-feature', + prompt: 'Test prompt', + projectPath: '/project', + abortController: new AbortController(), + provider: mockProvider, + effectiveBareModel: 'claude-sonnet-4-6', + planningMode: 'skip', + }; + + const callbacks = { + waitForApproval: vi.fn().mockResolvedValue({ approved: true }), + saveFeatureSummary: vi.fn(), + updateFeatureSummary: vi.fn(), + buildTaskPrompt: vi.fn().mockReturnValue('task prompt'), + }; + + await 
expect(executor.execute(options, callbacks)).rejects.toThrow( + 'Agent execution ended with: error_max_budget_usd' + ); + }); + + it('should NOT throw when result subtype is success', async () => { + const executor = new AgentExecutor( + mockEventBus, + mockFeatureStateManager, + mockPlanApprovalService, + mockSettingsService + ); + + const mockProvider = { + getName: () => 'mock', + executeQuery: vi.fn().mockImplementation(function* () { + yield { + type: 'assistant', + message: { + content: [{ type: 'text', text: 'Done!' }], + }, + }; + yield { + type: 'result', + subtype: 'success', + session_id: 'sess-ok', + }; + }), + } as unknown as BaseProvider; + + const options: AgentExecutionOptions = { + workDir: '/test', + featureId: 'test-feature', + prompt: 'Test prompt', + projectPath: '/project', + abortController: new AbortController(), + provider: mockProvider, + effectiveBareModel: 'claude-sonnet-4-6', + planningMode: 'skip', + }; + + const callbacks = { + waitForApproval: vi.fn().mockResolvedValue({ approved: true }), + saveFeatureSummary: vi.fn(), + updateFeatureSummary: vi.fn(), + buildTaskPrompt: vi.fn().mockReturnValue('task prompt'), + }; + + // Should resolve without throwing + const result = await executor.execute(options, callbacks); + expect(result.aborted).toBe(false); + expect(result.responseText).toContain('Done!'); + }); + it('should throw error when authentication fails in response', async () => { const executor = new AgentExecutor( mockEventBus, diff --git a/apps/ui/playwright.config.ts b/apps/ui/playwright.config.ts index 813a80178..36599a3d8 100644 --- a/apps/ui/playwright.config.ts +++ b/apps/ui/playwright.config.ts @@ -1,7 +1,7 @@ import { defineConfig, devices } from '@playwright/test'; -const port = process.env.TEST_PORT || 3007; -const serverPort = process.env.TEST_SERVER_PORT || 3008; +const port = process.env.TEST_PORT || 3107; +const serverPort = process.env.TEST_SERVER_PORT || 3108; const reuseServer = process.env.TEST_REUSE_SERVER === 
'true'; const useExternalBackend = !!process.env.VITE_SERVER_URL; // Always use mock agent for tests (disables rate limiting, uses mock Claude responses) diff --git a/apps/ui/scripts/kill-test-servers.mjs b/apps/ui/scripts/kill-test-servers.mjs index b24d608c4..9597b3921 100644 --- a/apps/ui/scripts/kill-test-servers.mjs +++ b/apps/ui/scripts/kill-test-servers.mjs @@ -8,8 +8,8 @@ import { promisify } from 'util'; const execAsync = promisify(exec); -const SERVER_PORT = process.env.TEST_SERVER_PORT || 3008; -const UI_PORT = process.env.TEST_PORT || 3007; +const SERVER_PORT = process.env.TEST_SERVER_PORT || 3108; +const UI_PORT = process.env.TEST_PORT || 3107; const USE_EXTERNAL_SERVER = !!process.env.VITE_SERVER_URL; async function killProcessOnPort(port) { diff --git a/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx b/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx index b62edfc1a..5e995e65a 100644 --- a/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx +++ b/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx @@ -28,7 +28,11 @@ import { cn } from '@/lib/utils'; import { modelSupportsThinking } from '@/lib/utils'; import { useAppStore, ThinkingLevel, FeatureImage, PlanningMode, Feature } from '@/store/app-store'; import type { ReasoningEffort, PhaseModelEntry, AgentModel } from '@automaker/types'; -import { supportsReasoningEffort, normalizeThinkingLevelForModel } from '@automaker/types'; +import { + supportsReasoningEffort, + normalizeThinkingLevelForModel, + getThinkingLevelsForModel, +} from '@automaker/types'; import { PrioritySelector, WorkModeSelector, diff --git a/apps/ui/src/components/views/settings-view/model-defaults/phase-model-selector.tsx b/apps/ui/src/components/views/settings-view/model-defaults/phase-model-selector.tsx index d8bda267e..3c50eb295 100644 --- a/apps/ui/src/components/views/settings-view/model-defaults/phase-model-selector.tsx +++ 
b/apps/ui/src/components/views/settings-view/model-defaults/phase-model-selector.tsx @@ -1017,6 +1017,7 @@ export function PhaseModelSelector({ {/* Secondary zone: expand reasoning effort popover */} { if (!isOpen) { setExpandedCodexModel(null); @@ -1409,7 +1410,9 @@ export function PhaseModelSelector({ return (