Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gemini/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"theme": "Default"
}
2 changes: 1 addition & 1 deletion conf/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ survey:
# LLM CONFIGURATION
# ============================================================================
# Large Language Model provider settings
llm_provider: "aws" # Provider: "openai" or "aws" or "anthropic"
llm_provider: "openai" # Provider: "openai" or "aws" or "anthropic"

# ============================================================================
# GENERAL SETTINGS
Expand Down
15 changes: 13 additions & 2 deletions src/simulated_web_agent/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from . import context, gpt
from .gpt import async_chat, load_prompt
from .memory import Action, Memory, MemoryPiece, Observation, Plan, Reflection, Thought
from ..main.cost_calculator import calculate_cost

PERCEIVE_PROMPT = load_prompt("perceive")
REFLECT_PROMPT = load_prompt("reflect")
Expand All @@ -27,11 +28,18 @@ def __enter__(self):
self.retrieve_result = []
self.request = []
self.response = []
self.usage = None # Token usage information
self.cost = 0.0 # Cost of this API call
self.model = None # Model used for this call
self.start_time = time.time()
context.api_call_manager.set(self)

def __exit__(self, exc_type, exc_value, traceback):
context.api_call_manager.set(None)
# Calculate cost if usage information is available
if self.usage and self.model:
self.cost = calculate_cost(self.model, self.usage)

with open(
context.run_path.get()
/ "api_trace"
Expand All @@ -45,6 +53,9 @@ def __exit__(self, exc_type, exc_value, traceback):
"method_name": self.method_name,
"retrieve_result": self.retrieve_result,
"time": time.time() - self.start_time,
"usage": self.usage,
"cost": self.cost,
"model": self.model,
},
f,
)
Expand Down Expand Up @@ -318,8 +329,8 @@ async def act(self, env):
memories = self.format_memories(memories)
assert self.current_plan is not None
clickables = [e for e in env["clickable_elements"] if e is not None]
inputs = [e for e in env["clickable_elements"] if e is not None]
selects = [e for e in env["clickable_elements"] if e is not None]
inputs = [e["id"] for e in env.get("input_elements", []) if e is not None]
selects = [e["id"] for e in env.get("select_elements", []) if e is not None]
action = await async_chat(
[
{"role": "system", "content": ACTION_PROMPT},
Expand Down
11 changes: 10 additions & 1 deletion src/simulated_web_agent/agent/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
"model_name": "openai",
"litellm_params": {
"model": "openai/gpt-5-mini",
"reasoning_effort": "minimal",
},
},
{
Expand Down Expand Up @@ -272,6 +271,16 @@ async def async_chat(

if context.api_call_manager.get() and log:
context.api_call_manager.get().response.append(content)
# Extract and store usage information
if hasattr(response, 'usage'):
context.api_call_manager.get().usage = {
'prompt_tokens': response.usage.prompt_tokens,
'completion_tokens': response.usage.completion_tokens,
'total_tokens': response.usage.total_tokens,
}
# Store the model name
if hasattr(response, 'model'):
context.api_call_manager.get().model = response.model

if json_mode:
# Extract JSON substring from the content
Expand Down
74 changes: 46 additions & 28 deletions src/simulated_web_agent/executor/parser/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@ const parse = () => {

const ALLOWED_ATTR = new Set([
'id', 'type', 'name', 'value', 'placeholder',
'checked', 'disabled', 'readonly', 'required', 'maxlength',
'min', 'max', 'step', 'role', 'tabindex', 'alt', 'title',
'for', 'action', 'method', 'contenteditable', 'selected',
'multiple', 'autocomplete'
'checked', 'disabled', 'readonly', 'alt', 'title',
'for', 'contenteditable', 'selected', 'multiple'
]);

const PRESERVE_EMPTY_TAGS = new Set([
Expand All @@ -30,8 +28,8 @@ const parse = () => {
for (const a of src.attributes) {
if (
ALLOWED_ATTR.has(a.name) ||
a.name.startsWith('aria-') ||
a.name.startsWith('parser-')
(a.name.startsWith('aria-') && a.name === 'aria-label') ||
(a.name.startsWith('parser-') && (a.name === 'parser-clickable' || a.name === 'parser-semantic-id'))
) {
dst.setAttribute(a.name, a.value);
}
Expand Down Expand Up @@ -105,11 +103,48 @@ const parse = () => {
parent.innerHTML = child.innerHTML;
};

const unwrapUselessSpans = (el) => {
// Remove <span> wrappers that carry no parser-semantic-id, splicing their
// children into the parent. This shrinks the serialized HTML without losing
// any element the agent can actually reference.
const walker = document.createTreeWalker(el, NodeFilter.SHOW_ELEMENT);
const spansToUnwrap = [];
// TreeWalker.currentNode starts at `el` itself, so the root is checked too.
let node = walker.currentNode;

while (node) {
if (node.tagName.toLowerCase() === 'span' && !node.hasAttribute('parser-semantic-id')) {
spansToUnwrap.push(node);
}
node = walker.nextNode();
}

// Collect first, then unwrap in reverse (deepest-last) order so that moving
// a span's children does not disturb nodes still pending in the list.
for (let i = spansToUnwrap.length - 1; i >= 0; i--) {
const span = spansToUnwrap[i];
const parent = span.parentNode;
if (parent) {
// Move each child out in front of the span, then drop the empty span.
while (span.firstChild) {
parent.insertBefore(span.firstChild, span);
}
parent.removeChild(span);
}
}

return el;
};

const flatten = (el) => {
while (el.children.length === 1) {
const child = el.children[0];
const p = el.tagName.toLowerCase();
const c = child.tagName.toLowerCase();

// Keep only one child if tags are the same (e.g., span > span, div > div)
if (p === c && p !== 'body' && p !== 'html' && p !== 'head' && p !== 'title') {
pullUpChild(el, child);
continue;
}

// Original logic for div handling
if (p !== 'div' && c !== 'div') break;
el = (p === 'div' && c !== 'div')
? replaceElement(el, child.tagName, child)
Expand Down Expand Up @@ -151,12 +186,6 @@ const parse = () => {
copyAllowed(original, clone);

const computedStyle = window.getComputedStyle(original);
if (computedStyle.pointerEvents !== 'auto') {
clone.setAttribute('parser-pointer-events', computedStyle.pointerEvents);
}
if (document.activeElement === original) {
clone.setAttribute('parser-is-focused', 'true');
}

const isDisabled = original.disabled ||
original.hasAttribute('disabled') ||
Expand Down Expand Up @@ -203,17 +232,8 @@ const parse = () => {
if (!inputIsDisabled && thisName) {
clone.setAttribute('parser-semantic-id', thisName);
clone.setAttribute('value', original.value || '');
clone.setAttribute('parser-input-disabled', 'false');
clone.setAttribute('parser-can-edit', !original.readOnly ? 'true' : 'false');
original.setAttribute('parser-semantic-id', thisName);
}
if (!inputIsDisabled && thisName && t === 'number') {
clone.setAttribute('parser-numeric-value', original.valueAsNumber || '');
}
if (!inputIsDisabled && thisName && original.selectionStart !== undefined) {
clone.setAttribute('parser-selection-start', original.selectionStart);
clone.setAttribute('parser-selection-end', original.selectionEnd);
}
}

if (tag === 'select') {
Expand All @@ -224,17 +244,11 @@ const parse = () => {
thisName = uniqueName(parentName ? `${parentName}.${base}` : base);
}
clone.setAttribute('parser-semantic-id', thisName);
clone.setAttribute('parser-value', original.value);
clone.setAttribute('parser-selected-index', original.selectedIndex);
clone.setAttribute('parser-has-multiple', original.multiple ? 'true' : 'false');
const selectedOptions = Array.from(original.selectedOptions).map(opt => opt.value).join(',');
clone.setAttribute('parser-selected-values', selectedOptions);
original.setAttribute('parser-semantic-id', thisName);
for (const opt of original.querySelectorAll('option')) {
const o = document.createElement('option');
o.textContent = opt.textContent.trim();
o.setAttribute('value', opt.value);
o.setAttribute('parser-selected', opt.selected ? 'true' : 'false');
const optName = uniqueName(`${thisName}.${slug(opt.textContent)}`);
o.setAttribute('parser-semantic-id', optName);
opt.setAttribute('parser-semantic-id', optName);
Expand Down Expand Up @@ -268,10 +282,14 @@ const parse = () => {
}
}

// Unwrap useless spans only at the top level (avoid calling during recursion)
// This will be called once on the root element
return clone;
}

const result = automaticStripElement(document.documentElement);
let result = automaticStripElement(document.documentElement);
// Unwrap all span tags without semantic-id to reduce HTML size
result = unwrapUselessSpans(result);
return {
html: result.outerHTML,
clickable_elements: Array.from(result.querySelectorAll('[parser-clickable="true"]'))
Expand Down
133 changes: 133 additions & 0 deletions src/simulated_web_agent/main/cost_calculator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""
Cost calculator for LLM API calls.
Provides pricing information for different models and calculates costs.
"""

# Model pricing in USD per 1M tokens of input/output.
# Last updated: 2024-12-05
# Source: official pricing pages of each provider.
# NOTE(review): some entries look suspect (e.g. "gpt-5" at $15/$60 while
# "gpt-5-mini" shares gpt-4o-mini's $0.15/$0.60) — confirm against the
# providers' current published rates; LLM prices change frequently.
MODEL_PRICING = {
    # OpenAI models
    "openai/gpt-4o-mini": {
        "input": 0.15,  # $0.15 per 1M input tokens
        "output": 0.60,  # $0.60 per 1M output tokens
    },
    "gpt-4o-mini": {
        "input": 0.15,
        "output": 0.60,
    },
    "openai/gpt-5": {
        "input": 15.00,  # $15 per 1M input tokens
        "output": 60.00,  # $60 per 1M output tokens
    },
    "gpt-5": {
        "input": 15.00,
        "output": 60.00,
    },
    "openai/gpt-5-mini": {
        "input": 0.15,
        "output": 0.60,
    },
    "gpt-5-mini": {
        "input": 0.15,
        "output": 0.60,
    },
    # Anthropic Claude models
    "claude-sonnet-4-20250514": {
        "input": 3.00,  # $3 per 1M input tokens
        "output": 15.00,  # $15 per 1M output tokens
    },
    "claude-sonnet-4": {
        "input": 3.00,
        "output": 15.00,
    },
    "bedrock/global.anthropic.claude-sonnet-4-5-20250929-v1:0": {
        "input": 3.00,
        "output": 15.00,
    },
    "bedrock/claude-sonnet-4": {
        "input": 3.00,
        "output": 15.00,
    },
    # AWS Bedrock Claude models
    "bedrock/global.anthropic.claude-haiku-4-5-20251001-v1:0": {
        "input": 0.80,  # $0.80 per 1M input tokens
        "output": 4.00,  # $4 per 1M output tokens
    },
    "claude-haiku-4-5": {
        "input": 0.80,
        "output": 4.00,
    },
    # Embedding models
    "openai/text-embedding-3-small": {
        "input": 0.02,  # $0.02 per 1M tokens
        "output": 0.00,  # No output tokens for embeddings
    },
    "text-embedding-3-small": {
        "input": 0.02,
        "output": 0.00,
    },
    "bedrock/cohere.embed-english-v3": {
        "input": 0.10,
        "output": 0.00,
    },
}


def get_pricing(model_name: str) -> dict:
    """
    Get pricing information for a model.

    Matching strategy:
      1. Exact key match in ``MODEL_PRICING``.
      2. Most specific (longest) key that substring-matches ``model_name``
         in either direction.  Picking the longest match instead of the
         first one found prevents a short key like "gpt-5" from shadowing
         "gpt-5-mini" — a mistake that would overstate cost by 100x.
      3. A default $1.00/$1.00 fallback, with a printed warning.

    Args:
        model_name: Name of the model (bare or provider-qualified).

    Returns:
        Dictionary with 'input' and 'output' keys containing prices per 1M tokens
    """
    # Try exact match first
    if model_name in MODEL_PRICING:
        return MODEL_PRICING[model_name]

    # Collect every partial match (e.g. "gpt-4o-mini" vs "openai/gpt-4o-mini"),
    # then return the most specific (longest) key among them.
    candidates = [
        key
        for key in MODEL_PRICING
        if model_name in key or key in model_name
    ]
    if candidates:
        return MODEL_PRICING[max(candidates, key=len)]

    # Default pricing (fallback)
    print(f"Warning: Pricing not found for model '{model_name}', using default pricing")
    return {"input": 1.00, "output": 1.00}


def calculate_cost(model_name: str, usage: dict) -> float:
    """
    Calculate the cost of an API call.

    Args:
        model_name: Name of the model used (resolved via ``get_pricing``)
        usage: Dictionary with 'prompt_tokens' and 'completion_tokens'.
            Keys that are missing — or present but set to ``None``, as some
            providers report — are treated as zero tokens.

    Returns:
        Cost in USD (0.0 when no usage information is available)
    """
    if not usage:
        return 0.0

    pricing = get_pricing(model_name)
    # `or 0` guards against explicit None token counts, which would otherwise
    # raise a TypeError in the multiplication below.
    prompt_tokens = usage.get('prompt_tokens') or 0
    completion_tokens = usage.get('completion_tokens') or 0

    # Calculate cost: tokens * (price per 1M tokens) / 1,000,000
    input_cost = prompt_tokens * pricing['input'] / 1_000_000
    output_cost = completion_tokens * pricing['output'] / 1_000_000

    return input_cost + output_cost


def format_cost(cost: float) -> str:
    """Render *cost* as a dollar string, using more decimals for tiny amounts."""
    if cost >= 0.01:
        precision = 4
    elif cost >= 0.001:
        precision = 5
    else:
        precision = 6
    return f"${cost:.{precision}f}"
Loading