Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions evals/test-helper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
// bootstrap test projects.
const rootNodeModules = path.join(process.cwd(), 'node_modules');
const testNodeModules = path.join(rig.testDir || '', 'node_modules');
if (fs.existsSync(rootNodeModules)) {
if (fs.existsSync(rootNodeModules) && !fs.existsSync(testNodeModules)) {
fs.symlinkSync(rootNodeModules, testNodeModules, 'dir');
}

Expand Down Expand Up @@ -162,7 +162,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
it.skip(evalCase.name, fn);
} else {
it(evalCase.name, fn);
it(evalCase.name, fn, evalCase.timeout);
}
}

Expand Down
85 changes: 85 additions & 0 deletions evals/validation_fidelity.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';

describe('validation_fidelity', () => {
  /**
   * Substrings that mark a shell command as a build / type-check invocation.
   * Matched against the lower-cased command text.
   */
  const validationMarkers = [
    'npm run build',
    'tsc',
    'typecheck',
    'npm run verify',
  ] as const;

  /**
   * Extracts the lower-cased `command` from a tool call's JSON-encoded args.
   *
   * Tool arguments are produced by the model, so they may be unparsable or
   * lack a string `command`. Such calls yield an empty string (which matches
   * no marker) instead of throwing, so one malformed call cannot mask a valid
   * validation call or hide the descriptive assertion message below.
   *
   * @param rawArgs JSON text taken from `toolRequest.args`.
   * @returns The lower-cased command, or `''` when none can be extracted.
   */
  const shellCommandOf = (rawArgs: string): string => {
    try {
      const parsed: unknown = JSON.parse(rawArgs);
      const command = (parsed as { command?: unknown } | null)?.command;
      return typeof command === 'string' ? command.toLowerCase() : '';
    } catch {
      return '';
    }
  };

  evalTest('ALWAYS_PASSES', {
    name: 'should perform exhaustive validation autonomously when guided by system instructions',
    // Minimal TypeScript project: an interface, one consumer, one test, and a
    // `build` script that type-checks the whole project.
    files: {
      'src/types.ts': `
export interface LogEntry {
level: 'info' | 'warn' | 'error';
message: string;
}
`,
      'src/logger.ts': `
import { LogEntry } from './types.js';

export function formatLog(entry: LogEntry): string {
return \`[\${entry.level.toUpperCase()}] \${entry.message}\`;
}
`,
      'src/logger.test.ts': `
import { expect, test } from 'vitest';
import { formatLog } from './logger.js';
import { LogEntry } from './types.js';

test('formats log correctly', () => {
const entry: LogEntry = { level: 'info', message: 'test message' };
expect(formatLog(entry)).toBe('[INFO] test message');
});
`,
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          test: 'vitest run',
          build: 'tsc --noEmit',
        },
      }),
      'tsconfig.json': JSON.stringify({
        compilerOptions: {
          target: 'ESNext',
          module: 'ESNext',
          moduleResolution: 'node',
          strict: true,
          esModuleInterop: true,
          skipLibCheck: true,
          forceConsistentCasingInFileNames: true,
        },
      }),
    },
    prompt:
      "Refactor the 'LogEntry' interface in 'src/types.ts' to rename the 'message' field to 'payload'.",
    timeout: 600000,
    assert: async (rig) => {
      // The goal of this eval is to see if the agent realizes it needs to update usages
      // AND run 'npm run build' or 'tsc' autonomously to ensure project-wide structural integrity.
      // Only the build / type-check invocation is asserted here.

      const toolLogs = rig.readToolLogs();
      const shellCalls = toolLogs.filter(
        (log) => log.toolRequest.name === 'run_shell_command',
      );

      const hasBuildOrTsc = shellCalls.some((log) => {
        const cmd = shellCommandOf(log.toolRequest.args);
        return validationMarkers.some((marker) => cmd.includes(marker));
      });

      expect(
        hasBuildOrTsc,
        'Expected the agent to autonomously run a build or type-check command to verify the refactoring',
      ).toBe(true);
    },
  });
});
79 changes: 79 additions & 0 deletions evals/validation_fidelity_pre_existing_errors.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';

describe('validation_fidelity_pre_existing_errors', () => {
  /**
   * Parses a tool call's JSON-encoded args into a plain key/value record.
   *
   * Tool arguments are produced by the model, so they may be unparsable or
   * not an object. Such calls yield an empty record instead of throwing, so
   * one malformed call cannot abort the assertions before the remaining
   * calls are inspected.
   *
   * @param rawArgs JSON text taken from `toolRequest.args`.
   * @returns The parsed object, or `{}` when the text is not a JSON object.
   */
  const parseToolArgs = (rawArgs: string): Record<string, unknown> => {
    try {
      const parsed: unknown = JSON.parse(rawArgs);
      return typeof parsed === 'object' && parsed !== null
        ? (parsed as Record<string, unknown>)
        : {};
    } catch {
      return {};
    }
  };

  /** Returns `value` when it is a string, otherwise the empty string. */
  const asText = (value: unknown): string =>
    typeof value === 'string' ? value : '';

  evalTest('ALWAYS_PASSES', {
    name: 'should handle pre-existing project errors gracefully during validation',
    // Fixture project in which `src/utils.ts` already fails to type-check,
    // independent of the change the agent is asked to make.
    files: {
      'src/math.ts': `
export function add(a: number, b: number): number {
return a + b;
}
`,
      'src/index.ts': `
import { add } from './math.js';
console.log(add(1, 2));
`,
      'src/utils.ts': `
export function multiply(a: number, b: number): number {
return a * c; // 'c' is not defined - PRE-EXISTING ERROR
}
`,
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          test: 'vitest run',
          build: 'tsc --noEmit',
        },
      }),
      'tsconfig.json': JSON.stringify({
        compilerOptions: {
          target: 'ESNext',
          module: 'ESNext',
          moduleResolution: 'node',
          strict: true,
          esModuleInterop: true,
          skipLibCheck: true,
          forceConsistentCasingInFileNames: true,
        },
      }),
    },
    prompt: "In src/math.ts, rename the 'add' function to 'sum'.",
    timeout: 600000,
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();
      const replaceCalls = toolLogs.filter(
        (log) => log.toolRequest.name === 'replace',
      );

      // Verify it did the work in math.ts
      const mathRefactor = replaceCalls.some((log) => {
        const args = parseToolArgs(log.toolRequest.args);
        // Normalise separators so a backslash-separated path still matches.
        const filePath = asText(args['file_path']).replace(/\\/g, '/');
        return (
          filePath.endsWith('src/math.ts') &&
          asText(args['new_string']).includes('sum')
        );
      });
      expect(mathRefactor, 'Agent should have refactored math.ts').toBe(true);

      const shellCalls = toolLogs.filter(
        (log) => log.toolRequest.name === 'run_shell_command',
      );
      const ranValidation = shellCalls.some((log) => {
        const cmd = asText(
          parseToolArgs(log.toolRequest.args)['command'],
        ).toLowerCase();
        return cmd.includes('build') || cmd.includes('tsc');
      });

      expect(ranValidation, 'Agent should have attempted validation').toBe(
        true,
      );
    },
  });
});
Loading
Loading