Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gemini/commands/fix-behavioral-eval.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ You are an expert at fixing behavioral evaluations.
the same scenario. We don't want to lose test fidelity by making the prompts too
direct (i.e.: easy).
- Your primary mechanism for improving the agent's behavior is to make changes to
tool instructions, prompt.ts, and/or modules that contribute to the prompt.
tool instructions, system prompt (snippets.ts), and/or modules that contribute to the prompt.
- If prompt and description changes are unsuccessful, use logs and debugging to
confirm that everything is working as expected.
- If unable to fix the test, you can make recommendations for architecture changes
Expand Down
110 changes: 110 additions & 0 deletions evals/edit-locations-eval.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';

describe('Edits location eval', () => {
/**
* Ensure that Gemini CLI always updates existing test files, if present,
* instead of creating a new one.
*/
evalTest('USUALLY_PASSES', {
name: 'should update existing test file instead of creating a new one',
files: {
'package.json': JSON.stringify(
{
name: 'test-location-repro',
version: '1.0.0',
scripts: {
test: 'vitest run',
},
devDependencies: {
vitest: '^1.0.0',
typescript: '^5.0.0',
},
},
null,
2,
),
'src/math.ts': `
export function add(a: number, b: number): number {
return a + b;
}

export function subtract(a: number, b: number): number {
return a - b;
}

export function multiply(a: number, b: number): number {
return a + b;
}
`,
'src/math.test.ts': `
import { expect, test } from 'vitest';
import { add, subtract } from './math';

test('add adds two numbers', () => {
expect(add(2, 3)).toBe(5);
});

test('subtract subtracts two numbers', () => {
expect(subtract(5, 3)).toBe(2);
});
`,
'src/utils.ts': `
export function capitalize(s: string): string {
return s.charAt(0).toUpperCase() + s.slice(1);
}
`,
'src/utils.test.ts': `
import { expect, test } from 'vitest';
import { capitalize } from './utils';

test('capitalize capitalizes the first letter', () => {
expect(capitalize('hello')).toBe('Hello');
});
`,
},
prompt: 'Fix the bug in src/math.ts. Do not run the code.',
timeout: 180000,
assert: async (rig) => {
// Read the recorded tool calls and split out the two tool names this eval
// inspects: 'replace' and 'write_file'.
const toolLogs = rig.readToolLogs();
const replaceCalls = toolLogs.filter(
(t) => t.toolRequest.name === 'replace',
);
const writeFileCalls = toolLogs.filter(
(t) => t.toolRequest.name === 'write_file',
);

// At least one 'replace' call must have been issued.
expect(replaceCalls.length).toBeGreaterThan(0);
// No 'write_file' call may mention '.test.ts' anywhere in its args. The
// check is a substring match on the unparsed args (args is handed to
// JSON.parse below, so it is treated as a JSON string here).
expect(
writeFileCalls.some((file) =>
file.toolRequest.args.includes('.test.ts'),
),
).toBe(false);

// Pull the file_path out of each 'replace' call. Args that fail to parse as
// JSON yield null instead of throwing, so one malformed entry does not abort
// the whole assertion.
const targetFiles = replaceCalls.map((t) => {
try {
return JSON.parse(t.toolRequest.args).file_path;
} catch {
return null;
}
});

// Debug output: prints the extracted paths to the console.
console.log('DEBUG: targetFiles', targetFiles);

// Exactly two distinct files are expected to be edited: the assertions that
// follow require src/math.ts and src/math.test.ts to be among them. A
// lower-bound check would contradict the failure message below and would let
// edits to unrelated files pass unnoticed.
expect(
new Set(targetFiles).size,
'Expected only two files changed',
).toBe(2);
Comment on lines +100 to +103
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The assertion message 'Expected only two files changed' suggests an exact check, but greaterThanOrEqual(2) allows for more than two files to be changed. To make the test stricter and align with the stated expectation, you should check for an exact size of 2.

      expect(new Set(targetFiles).size, 'Expected only two files changed').toBe(2);

expect(targetFiles.some((f) => f?.endsWith('src/math.ts'))).toBe(true);
expect(targetFiles.some((f) => f?.endsWith('src/math.test.ts'))).toBe(
true,
);
},
});
});
Loading
Loading