Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gemini/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"theme": "Default"
}
2 changes: 1 addition & 1 deletion conf/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ survey:
# LLM CONFIGURATION
# ============================================================================
# Large Language Model provider settings
llm_provider: "aws" # Provider: "openai" or "aws" or "anthropic"
llm_provider: "openai" # Provider: "openai" or "aws" or "anthropic"

# ============================================================================
# GENERAL SETTINGS
Expand Down
15 changes: 13 additions & 2 deletions src/simulated_web_agent/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from . import context, gpt
from .gpt import async_chat, load_prompt
from .memory import Action, Memory, MemoryPiece, Observation, Plan, Reflection, Thought
from ..main.cost_calculator import calculate_cost

PERCEIVE_PROMPT = load_prompt("perceive")
REFLECT_PROMPT = load_prompt("reflect")
Expand All @@ -27,11 +28,18 @@ def __enter__(self):
self.retrieve_result = []
self.request = []
self.response = []
self.usage = None # Token usage information
self.cost = 0.0 # Cost of this API call
self.model = None # Model used for this call
self.start_time = time.time()
context.api_call_manager.set(self)

def __exit__(self, exc_type, exc_value, traceback):
context.api_call_manager.set(None)
# Calculate cost if usage information is available
if self.usage and self.model:
self.cost = calculate_cost(self.model, self.usage)

with open(
context.run_path.get()
/ "api_trace"
Expand All @@ -45,6 +53,9 @@ def __exit__(self, exc_type, exc_value, traceback):
"method_name": self.method_name,
"retrieve_result": self.retrieve_result,
"time": time.time() - self.start_time,
"usage": self.usage,
"cost": self.cost,
"model": self.model,
},
f,
)
Expand Down Expand Up @@ -318,8 +329,8 @@ async def act(self, env):
memories = self.format_memories(memories)
assert self.current_plan is not None
clickables = [e for e in env["clickable_elements"] if e is not None]
inputs = [e for e in env["clickable_elements"] if e is not None]
selects = [e for e in env["clickable_elements"] if e is not None]
inputs = [e["id"] for e in env.get("input_elements", []) if e is not None]
selects = [e["id"] for e in env.get("select_elements", []) if e is not None]
action = await async_chat(
[
{"role": "system", "content": ACTION_PROMPT},
Expand Down
11 changes: 10 additions & 1 deletion src/simulated_web_agent/agent/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
"model_name": "openai",
"litellm_params": {
"model": "openai/gpt-5-mini",
"reasoning_effort": "minimal",
},
},
{
Expand Down Expand Up @@ -272,6 +271,16 @@ async def async_chat(

if context.api_call_manager.get() and log:
context.api_call_manager.get().response.append(content)
# Extract and store usage information
if hasattr(response, 'usage'):
context.api_call_manager.get().usage = {
'prompt_tokens': response.usage.prompt_tokens,
'completion_tokens': response.usage.completion_tokens,
'total_tokens': response.usage.total_tokens,
}
# Store the model name
if hasattr(response, 'model'):
context.api_call_manager.get().model = response.model

if json_mode:
# Extract JSON substring from the content
Expand Down
74 changes: 46 additions & 28 deletions src/simulated_web_agent/executor/parser/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@ const parse = () => {

const ALLOWED_ATTR = new Set([
'id', 'type', 'name', 'value', 'placeholder',
'checked', 'disabled', 'readonly', 'required', 'maxlength',
'min', 'max', 'step', 'role', 'tabindex', 'alt', 'title',
'for', 'action', 'method', 'contenteditable', 'selected',
'multiple', 'autocomplete'
'checked', 'disabled', 'readonly', 'alt', 'title',
'for', 'contenteditable', 'selected', 'multiple'
]);

const PRESERVE_EMPTY_TAGS = new Set([
Expand All @@ -30,8 +28,8 @@ const parse = () => {
for (const a of src.attributes) {
if (
ALLOWED_ATTR.has(a.name) ||
a.name.startsWith('aria-') ||
a.name.startsWith('parser-')
(a.name.startsWith('aria-') && a.name === 'aria-label') ||
(a.name.startsWith('parser-') && (a.name === 'parser-clickable' || a.name === 'parser-semantic-id'))
) {
dst.setAttribute(a.name, a.value);
}
Expand Down Expand Up @@ -105,11 +103,48 @@ const parse = () => {
parent.innerHTML = child.innerHTML;
};

const unwrapUselessSpans = (el) => {
// Remove <span> wrappers that carry no parser-semantic-id, splicing their
// children into the parent. This shrinks the serialized HTML without losing
// any element the agent can actually reference.
const walker = document.createTreeWalker(el, NodeFilter.SHOW_ELEMENT);
const spansToUnwrap = [];
// TreeWalker.currentNode starts at `el` itself, so the root is checked too.
let node = walker.currentNode;

while (node) {
if (node.tagName.toLowerCase() === 'span' && !node.hasAttribute('parser-semantic-id')) {
spansToUnwrap.push(node);
}
node = walker.nextNode();
}

// Collect first, then unwrap in reverse (deepest-last) order so that moving
// a span's children does not disturb nodes still pending in the list.
for (let i = spansToUnwrap.length - 1; i >= 0; i--) {
const span = spansToUnwrap[i];
const parent = span.parentNode;
if (parent) {
// Move each child out in front of the span, then drop the empty span.
while (span.firstChild) {
parent.insertBefore(span.firstChild, span);
}
parent.removeChild(span);
}
}

return el;
};

const flatten = (el) => {
while (el.children.length === 1) {
const child = el.children[0];
const p = el.tagName.toLowerCase();
const c = child.tagName.toLowerCase();

// Keep only one child if tags are the same (e.g., span > span, div > div)
if (p === c && p !== 'body' && p !== 'html' && p !== 'head' && p !== 'title') {
pullUpChild(el, child);
continue;
}

// Original logic for div handling
if (p !== 'div' && c !== 'div') break;
el = (p === 'div' && c !== 'div')
? replaceElement(el, child.tagName, child)
Expand Down Expand Up @@ -151,12 +186,6 @@ const parse = () => {
copyAllowed(original, clone);

const computedStyle = window.getComputedStyle(original);
if (computedStyle.pointerEvents !== 'auto') {
clone.setAttribute('parser-pointer-events', computedStyle.pointerEvents);
}
if (document.activeElement === original) {
clone.setAttribute('parser-is-focused', 'true');
}

const isDisabled = original.disabled ||
original.hasAttribute('disabled') ||
Expand Down Expand Up @@ -203,17 +232,8 @@ const parse = () => {
if (!inputIsDisabled && thisName) {
clone.setAttribute('parser-semantic-id', thisName);
clone.setAttribute('value', original.value || '');
clone.setAttribute('parser-input-disabled', 'false');
clone.setAttribute('parser-can-edit', !original.readOnly ? 'true' : 'false');
original.setAttribute('parser-semantic-id', thisName);
}
if (!inputIsDisabled && thisName && t === 'number') {
clone.setAttribute('parser-numeric-value', original.valueAsNumber || '');
}
if (!inputIsDisabled && thisName && original.selectionStart !== undefined) {
clone.setAttribute('parser-selection-start', original.selectionStart);
clone.setAttribute('parser-selection-end', original.selectionEnd);
}
}

if (tag === 'select') {
Expand All @@ -224,17 +244,11 @@ const parse = () => {
thisName = uniqueName(parentName ? `${parentName}.${base}` : base);
}
clone.setAttribute('parser-semantic-id', thisName);
clone.setAttribute('parser-value', original.value);
clone.setAttribute('parser-selected-index', original.selectedIndex);
clone.setAttribute('parser-has-multiple', original.multiple ? 'true' : 'false');
const selectedOptions = Array.from(original.selectedOptions).map(opt => opt.value).join(',');
clone.setAttribute('parser-selected-values', selectedOptions);
original.setAttribute('parser-semantic-id', thisName);
for (const opt of original.querySelectorAll('option')) {
const o = document.createElement('option');
o.textContent = opt.textContent.trim();
o.setAttribute('value', opt.value);
o.setAttribute('parser-selected', opt.selected ? 'true' : 'false');
const optName = uniqueName(`${thisName}.${slug(opt.textContent)}`);
o.setAttribute('parser-semantic-id', optName);
opt.setAttribute('parser-semantic-id', optName);
Expand Down Expand Up @@ -268,10 +282,14 @@ const parse = () => {
}
}

// Unwrap useless spans only at the top level (avoid calling during recursion)
// This will be called once on the root element
return clone;
}

const result = automaticStripElement(document.documentElement);
let result = automaticStripElement(document.documentElement);
// Unwrap all span tags without semantic-id to reduce HTML size
result = unwrapUselessSpans(result);
return {
html: result.outerHTML,
clickable_elements: Array.from(result.querySelectorAll('[parser-clickable="true"]'))
Expand Down
133 changes: 133 additions & 0 deletions src/simulated_web_agent/main/cost_calculator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""
Cost calculator for LLM API calls.
Provides pricing information for different models and calculates costs.
"""

# Model pricing in USD per 1M tokens of input/output.
# Last updated: 2024-12-05
# Source: official pricing pages of each provider.
# NOTE(review): some entries look suspect (e.g. "gpt-5" at $15/$60 while
# "gpt-5-mini" shares gpt-4o-mini's $0.15/$0.60) — confirm against the
# providers' current published rates; LLM prices change frequently.
MODEL_PRICING = {
    # OpenAI models
    "openai/gpt-4o-mini": {
        "input": 0.15,  # $0.15 per 1M input tokens
        "output": 0.60,  # $0.60 per 1M output tokens
    },
    "gpt-4o-mini": {
        "input": 0.15,
        "output": 0.60,
    },
    "openai/gpt-5": {
        "input": 15.00,  # $15 per 1M input tokens
        "output": 60.00,  # $60 per 1M output tokens
    },
    "gpt-5": {
        "input": 15.00,
        "output": 60.00,
    },
    "openai/gpt-5-mini": {
        "input": 0.15,
        "output": 0.60,
    },
    "gpt-5-mini": {
        "input": 0.15,
        "output": 0.60,
    },
    # Anthropic Claude models
    "claude-sonnet-4-20250514": {
        "input": 3.00,  # $3 per 1M input tokens
        "output": 15.00,  # $15 per 1M output tokens
    },
    "claude-sonnet-4": {
        "input": 3.00,
        "output": 15.00,
    },
    "bedrock/global.anthropic.claude-sonnet-4-5-20250929-v1:0": {
        "input": 3.00,
        "output": 15.00,
    },
    "bedrock/claude-sonnet-4": {
        "input": 3.00,
        "output": 15.00,
    },
    # AWS Bedrock Claude models
    "bedrock/global.anthropic.claude-haiku-4-5-20251001-v1:0": {
        "input": 0.80,  # $0.80 per 1M input tokens
        "output": 4.00,  # $4 per 1M output tokens
    },
    "claude-haiku-4-5": {
        "input": 0.80,
        "output": 4.00,
    },
    # Embedding models
    "openai/text-embedding-3-small": {
        "input": 0.02,  # $0.02 per 1M tokens
        "output": 0.00,  # No output tokens for embeddings
    },
    "text-embedding-3-small": {
        "input": 0.02,
        "output": 0.00,
    },
    "bedrock/cohere.embed-english-v3": {
        "input": 0.10,
        "output": 0.00,
    },
}


def get_pricing(model_name: str) -> dict:
    """
    Get pricing information for a model.

    Matching strategy:
      1. Exact key match in ``MODEL_PRICING``.
      2. Most specific (longest) key that substring-matches ``model_name``
         in either direction.  Picking the longest match instead of the
         first one found prevents a short key like "gpt-5" from shadowing
         "gpt-5-mini" — a mistake that would overstate cost by 100x.
      3. A default $1.00/$1.00 fallback, with a printed warning.

    Args:
        model_name: Name of the model (bare or provider-qualified).

    Returns:
        Dictionary with 'input' and 'output' keys containing prices per 1M tokens
    """
    # Try exact match first
    if model_name in MODEL_PRICING:
        return MODEL_PRICING[model_name]

    # Collect every partial match (e.g. "gpt-4o-mini" vs "openai/gpt-4o-mini"),
    # then return the most specific (longest) key among them.
    candidates = [
        key
        for key in MODEL_PRICING
        if model_name in key or key in model_name
    ]
    if candidates:
        return MODEL_PRICING[max(candidates, key=len)]

    # Default pricing (fallback)
    print(f"Warning: Pricing not found for model '{model_name}', using default pricing")
    return {"input": 1.00, "output": 1.00}


def calculate_cost(model_name: str, usage: dict) -> float:
    """
    Calculate the cost of an API call.

    Args:
        model_name: Name of the model used (resolved via ``get_pricing``)
        usage: Dictionary with 'prompt_tokens' and 'completion_tokens'.
            Keys that are missing — or present but set to ``None``, as some
            providers report — are treated as zero tokens.

    Returns:
        Cost in USD (0.0 when no usage information is available)
    """
    if not usage:
        return 0.0

    pricing = get_pricing(model_name)
    # `or 0` guards against explicit None token counts, which would otherwise
    # raise a TypeError in the multiplication below.
    prompt_tokens = usage.get('prompt_tokens') or 0
    completion_tokens = usage.get('completion_tokens') or 0

    # Calculate cost: tokens * (price per 1M tokens) / 1,000,000
    input_cost = prompt_tokens * pricing['input'] / 1_000_000
    output_cost = completion_tokens * pricing['output'] / 1_000_000

    return input_cost + output_cost


def format_cost(cost: float) -> str:
    """Render *cost* as a dollar string, using more decimals for tiny amounts."""
    if cost >= 0.01:
        precision = 4
    elif cost >= 0.001:
        precision = 5
    else:
        precision = 6
    return f"${cost:.{precision}f}"
Loading