Skip to content

Commit e52cd96

Browse files
committed
fix(evals): merge tool usage across task instances instead of ignoring
Instead of ignoring TaskTokenUsageUpdated events after TaskAborted, accumulate tool usage data using a MAX strategy. This ensures:
- Empty rehydrated data won't overwrite existing values: max(5, 0) = 5
- A legitimate restart with additional work is captured: max(5, 8) = 8

This approach is more robust than simply ignoring post-abort events, as it handles both spurious rehydration and legitimate restart scenarios.
1 parent 9fbb07f commit e52cd96

File tree

1 file changed

+24
-7
lines changed

1 file changed

+24
-7
lines changed

packages/evals/src/cli/runTask.ts

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
RooCodeEventName,
1414
IpcMessageType,
1515
EVALS_SETTINGS,
16+
type ToolUsage,
1617
} from "@roo-code/types"
1718
import { IpcClient } from "@roo-code/ipc"
1819

@@ -277,6 +278,8 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
277278
let taskMetricsId: number | undefined
278279
let rooTaskId: string | undefined
279280
let isClientDisconnected = false
281+
// Track accumulated tool usage across task instances (handles rehydration after abort)
282+
const accumulatedToolUsage: ToolUsage = {}
280283

281284
const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = {
282285
broadcast: [RooCodeEventName.Message],
@@ -364,21 +367,35 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
364367
await createToolError({ taskId: task.id, toolName, error })
365368
}
366369

367-
// After TaskAborted, ignore any further TaskTokenUsageUpdated events.
368-
// This prevents a rehydrated task instance (which has empty toolUsage)
369-
// from overwriting the final metrics that were saved before abort.
370370
if (
371371
(eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) &&
372-
taskMetricsId &&
373-
!taskAbortedAt
372+
taskMetricsId
374373
) {
375374
const duration = Date.now() - taskStartedAt
376375

377376
const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } =
378377
payload[1]
379378

380379
// For both TaskTokenUsageUpdated and TaskCompleted: toolUsage is payload[2]
381-
const toolUsage = payload[2]
380+
const incomingToolUsage: ToolUsage = payload[2] ?? {}
381+
382+
// Merge incoming tool usage with accumulated data using MAX strategy.
383+
// This handles the case where a task is rehydrated after abort:
384+
// - Empty rehydrated data won't overwrite existing: max(5, 0) = 5
385+
// - Legitimate restart with additional work is captured: max(5, 8) = 8
386+
// Each task instance tracks its own cumulative values, so we take the max
387+
// to preserve the highest values seen across all instances.
388+
for (const [toolName, usage] of Object.entries(incomingToolUsage)) {
389+
const existing = accumulatedToolUsage[toolName as keyof ToolUsage]
390+
if (existing) {
391+
accumulatedToolUsage[toolName as keyof ToolUsage] = {
392+
attempts: Math.max(existing.attempts, usage.attempts),
393+
failures: Math.max(existing.failures, usage.failures),
394+
}
395+
} else {
396+
accumulatedToolUsage[toolName as keyof ToolUsage] = { ...usage }
397+
}
398+
}
382399

383400
await updateTaskMetrics(taskMetricsId, {
384401
cost: totalCost,
@@ -388,7 +405,7 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
388405
duration,
389406
cacheWrites: totalCacheWrites ?? 0,
390407
cacheReads: totalCacheReads ?? 0,
391-
toolUsage, // Now included in every update
408+
toolUsage: accumulatedToolUsage,
392409
})
393410
}
394411

0 commit comments

Comments
 (0)