diff --git a/.gemini/settings.json b/.gemini/settings.json
new file mode 100644
index 0000000..9532e2c
--- /dev/null
+++ b/.gemini/settings.json
@@ -0,0 +1,3 @@
+{
+  "theme": "Default"
+}
\ No newline at end of file
diff --git a/conf/base.yaml b/conf/base.yaml
index 4c83152..4b928e5 100644
--- a/conf/base.yaml
+++ b/conf/base.yaml
@@ -92,7 +92,7 @@ survey:
 # LLM CONFIGURATION
 # ============================================================================
 # Large Language Model provider settings
-llm_provider: "aws" # Provider: "openai" or "aws" or "anthropic"
+llm_provider: "openai" # Provider: "openai" or "aws" or "anthropic"
 
 # ============================================================================
 # GENERAL SETTINGS
diff --git a/src/simulated_web_agent/agent/agent.py b/src/simulated_web_agent/agent/agent.py
index 4d5b103..bd0b00d 100644
--- a/src/simulated_web_agent/agent/agent.py
+++ b/src/simulated_web_agent/agent/agent.py
@@ -8,6 +8,7 @@
 from . import context, gpt
 from .gpt import async_chat, load_prompt
 from .memory import Action, Memory, MemoryPiece, Observation, Plan, Reflection, Thought
+from ..main.cost_calculator import calculate_cost
 
 PERCEIVE_PROMPT = load_prompt("perceive")
 REFLECT_PROMPT = load_prompt("reflect")
@@ -27,11 +28,18 @@ def __enter__(self):
         self.retrieve_result = []
         self.request = []
         self.response = []
+        self.usage = None  # Token usage information
+        self.cost = 0.0  # Cost of this API call
+        self.model = None  # Model used for this call
         self.start_time = time.time()
         context.api_call_manager.set(self)
 
     def __exit__(self, exc_type, exc_value, traceback):
         context.api_call_manager.set(None)
+        # Calculate cost if usage information is available
+        if self.usage and self.model:
+            self.cost = calculate_cost(self.model, self.usage)
+
         with open(
             context.run_path.get()
             / "api_trace"
@@ -45,6 +53,9 @@
                     "method_name": self.method_name,
                     "retrieve_result": self.retrieve_result,
                     "time": time.time() - self.start_time,
+                    "usage": self.usage,
+                    "cost": self.cost,
+                    "model": self.model,
                 },
                 f,
             )
@@ -318,8 +329,8 @@ async def act(self, env):
         memories = self.format_memories(memories)
         assert self.current_plan is not None
         clickables = [e for e in env["clickable_elements"] if e is not None]
-        inputs = [e for e in env["clickable_elements"] if e is not None]
-        selects = [e for e in env["clickable_elements"] if e is not None]
+        inputs = [e["id"] for e in env.get("input_elements", []) if e is not None]
+        selects = [e["id"] for e in env.get("select_elements", []) if e is not None]
         action = await async_chat(
             [
                 {"role": "system", "content": ACTION_PROMPT},
diff --git a/src/simulated_web_agent/agent/gpt.py b/src/simulated_web_agent/agent/gpt.py
index 8e7e23a..0c8f648 100644
--- a/src/simulated_web_agent/agent/gpt.py
+++ b/src/simulated_web_agent/agent/gpt.py
@@ -32,7 +32,6 @@
         "model_name": "openai",
         "litellm_params": {
             "model": "openai/gpt-5-mini",
-            "reasoning_effort": "minimal",
         },
     },
     {
@@ -272,6 +271,16 @@ async def async_chat(
     if context.api_call_manager.get() and log:
         context.api_call_manager.get().response.append(content)
+        # Extract and store usage information
+        if hasattr(response, 'usage'):
+            context.api_call_manager.get().usage = {
+                'prompt_tokens': response.usage.prompt_tokens,
+                'completion_tokens': response.usage.completion_tokens,
+                'total_tokens': response.usage.total_tokens,
+            }
+        # Store the model name
+        if hasattr(response, 'model'):
+            context.api_call_manager.get().model = response.model
 
     if json_mode:
         # Extract JSON substring from the content
diff --git a/src/simulated_web_agent/executor/parser/parser.js b/src/simulated_web_agent/executor/parser/parser.js
index 299c0ec..43f4fd6 100644
--- a/src/simulated_web_agent/executor/parser/parser.js
+++ b/src/simulated_web_agent/executor/parser/parser.js
@@ -13,10 +13,8 @@ const parse = () => {
     const ALLOWED_ATTR = new Set([
         'id', 'type', 'name', 'value', 'placeholder',
-        'checked', 'disabled', 'readonly', 'required', 'maxlength',
-        'min', 'max', 'step', 'role', 'tabindex', 'alt', 'title',
-        'for', 'action', 'method', 'contenteditable', 'selected',
-        'multiple', 'autocomplete'
+        'checked', 'disabled', 'readonly', 'alt', 'title',
+        'for', 'contenteditable', 'selected', 'multiple'
     ]);
 
     const PRESERVE_EMPTY_TAGS = new Set([
@@ -30,8 +28,8 @@ const parse = () => {
         for (const a of src.attributes) {
             if (
                 ALLOWED_ATTR.has(a.name) ||
-                a.name.startsWith('aria-') ||
-                a.name.startsWith('parser-')
+                (a.name.startsWith('aria-') && a.name === 'aria-label') ||
+                (a.name.startsWith('parser-') && (a.name === 'parser-clickable' || a.name === 'parser-semantic-id'))
             ) {
                 dst.setAttribute(a.name, a.value);
             }
@@ -105,11 +103,48 @@ const parse = () => {
         parent.innerHTML = child.innerHTML;
     };
 
+    const unwrapUselessSpans = (el) => {
+        // Unwrap all span tags (at any depth) that don't carry parser-semantic-id
+        // This reduces HTML size significantly without losing functionality
+        const walker = document.createTreeWalker(el, NodeFilter.SHOW_ELEMENT);
+        const spansToUnwrap = [];
+        let node = walker.currentNode;
+
+        while (node) {
+            if (node.tagName.toLowerCase() === 'span' && !node.hasAttribute('parser-semantic-id')) {
+                spansToUnwrap.push(node);
+            }
+            node = walker.nextNode();
+        }
+
+        // Unwrap in reverse order to avoid node reference issues
+        for (let i = spansToUnwrap.length - 1; i >= 0; i--) {
+            const span = spansToUnwrap[i];
+            const parent = span.parentNode;
+            if (parent) {
+                while (span.firstChild) {
+                    parent.insertBefore(span.firstChild, span);
+                }
+                parent.removeChild(span);
+            }
+        }
+
+        return el;
+    };
+
     const flatten = (el) => {
         while (el.children.length === 1) {
             const child = el.children[0];
             const p = el.tagName.toLowerCase();
             const c = child.tagName.toLowerCase();
+
+            // Collapse same-tag nesting (e.g., span > span, div > div) into a single element
+            if (p === c && p !== 'body' && p !== 'html' && p !== 'head' && p !== 'title') {
+                pullUpChild(el, child);
+                continue;
+            }
+
+            // Original logic for div handling
             if (p !== 'div' && c !== 'div') break;
             el = (p === 'div' && c !== 'div')
                 ? replaceElement(el, child.tagName, child)
@@ -151,12 +186,6 @@ const parse = () => {
         copyAllowed(original, clone);
         const computedStyle = window.getComputedStyle(original);
-        if (computedStyle.pointerEvents !== 'auto') {
-            clone.setAttribute('parser-pointer-events', computedStyle.pointerEvents);
-        }
-        if (document.activeElement === original) {
-            clone.setAttribute('parser-is-focused', 'true');
-        }
 
         const isDisabled =
             original.disabled ||
             original.hasAttribute('disabled') ||
@@ -203,17 +232,8 @@ const parse = () => {
             if (!inputIsDisabled && thisName) {
                 clone.setAttribute('parser-semantic-id', thisName);
                 clone.setAttribute('value', original.value || '');
-                clone.setAttribute('parser-input-disabled', 'false');
-                clone.setAttribute('parser-can-edit', !original.readOnly ? 'true' : 'false');
                 original.setAttribute('parser-semantic-id', thisName);
             }
-            if (!inputIsDisabled && thisName && t === 'number') {
-                clone.setAttribute('parser-numeric-value', original.valueAsNumber || '');
-            }
-            if (!inputIsDisabled && thisName && original.selectionStart !== undefined) {
-                clone.setAttribute('parser-selection-start', original.selectionStart);
-                clone.setAttribute('parser-selection-end', original.selectionEnd);
-            }
         }
 
         if (tag === 'select') {
@@ -224,17 +244,11 @@
                 thisName = uniqueName(parentName ? `${parentName}.${base}` : base);
             }
             clone.setAttribute('parser-semantic-id', thisName);
-            clone.setAttribute('parser-value', original.value);
-            clone.setAttribute('parser-selected-index', original.selectedIndex);
-            clone.setAttribute('parser-has-multiple', original.multiple ? 'true' : 'false');
-            const selectedOptions = Array.from(original.selectedOptions).map(opt => opt.value).join(',');
-            clone.setAttribute('parser-selected-values', selectedOptions);
             original.setAttribute('parser-semantic-id', thisName);
             for (const opt of original.querySelectorAll('option')) {
                 const o = document.createElement('option');
                 o.textContent = opt.textContent.trim();
                 o.setAttribute('value', opt.value);
-                o.setAttribute('parser-selected', opt.selected ? 'true' : 'false');
                 const optName = uniqueName(`${thisName}.${slug(opt.textContent)}`);
                 o.setAttribute('parser-semantic-id', optName);
                 opt.setAttribute('parser-semantic-id', optName);
@@ -268,10 +282,14 @@
             }
         }
 
+        // Unwrap useless spans only at the top level (avoid calling during recursion)
+        // This will be called once on the root element
         return clone;
     }
 
-    const result = automaticStripElement(document.documentElement);
+    let result = automaticStripElement(document.documentElement);
+    // Unwrap all span tags without semantic-id to reduce HTML size
+    result = unwrapUselessSpans(result);
     return {
         html: result.outerHTML,
         clickable_elements: Array.from(result.querySelectorAll('[parser-clickable="true"]'))
diff --git a/src/simulated_web_agent/main/cost_calculator.py b/src/simulated_web_agent/main/cost_calculator.py
new file mode 100644
index 0000000..79df003
--- /dev/null
+++ b/src/simulated_web_agent/main/cost_calculator.py
@@ -0,0 +1,133 @@
+"""
+Cost calculator for LLM API calls.
+Provides pricing information for different models and calculates costs.
+""" + +# Model pricing in USD per 1M tokens +# Last updated: 2024-12-05 +# Source: Official pricing pages of each provider +MODEL_PRICING = { + # OpenAI models + "openai/gpt-4o-mini": { + "input": 0.15, # $0.15 per 1M input tokens + "output": 0.60, # $0.60 per 1M output tokens + }, + "gpt-4o-mini": { + "input": 0.15, + "output": 0.60, + }, + "openai/gpt-5": { + "input": 15.00, # $15 per 1M input tokens + "output": 60.00, # $60 per 1M output tokens + }, + "gpt-5": { + "input": 15.00, + "output": 60.00, + }, + "openai/gpt-5-mini": { + "input": 0.15, + "output": 0.60, + }, + "gpt-5-mini": { + "input": 0.15, + "output": 0.60, + }, + # Anthropic Claude models + "claude-sonnet-4-20250514": { + "input": 3.00, # $3 per 1M input tokens + "output": 15.00, # $15 per 1M output tokens + }, + "claude-sonnet-4": { + "input": 3.00, + "output": 15.00, + }, + "bedrock/global.anthropic.claude-sonnet-4-5-20250929-v1:0": { + "input": 3.00, + "output": 15.00, + }, + "bedrock/claude-sonnet-4": { + "input": 3.00, + "output": 15.00, + }, + # AWS Bedrock Claude models + "bedrock/global.anthropic.claude-haiku-4-5-20251001-v1:0": { + "input": 0.80, # $0.80 per 1M input tokens + "output": 4.00, # $4 per 1M output tokens + }, + "claude-haiku-4-5": { + "input": 0.80, + "output": 4.00, + }, + # Embedding models + "openai/text-embedding-3-small": { + "input": 0.02, # $0.02 per 1M tokens + "output": 0.00, # No output tokens for embeddings + }, + "text-embedding-3-small": { + "input": 0.02, + "output": 0.00, + }, + "bedrock/cohere.embed-english-v3": { + "input": 0.10, + "output": 0.00, + }, +} + + +def get_pricing(model_name: str) -> dict: + """ + Get pricing information for a model. + + Args: + model_name: Name of the model + + Returns: + Dictionary with 'input' and 'output' keys containing prices per 1M tokens + """ + # Try exact match first + if model_name in MODEL_PRICING: + return MODEL_PRICING[model_name] + + # Try to find partial match (e.g., "gpt-4o-mini" in "openai/gpt-4o-mini") + for model_key in MODEL_PRICING: + if model_name in model_key or model_key in model_name: + return MODEL_PRICING[model_key] + + # Default pricing (fallback) + print(f"Warning: Pricing not found for model '{model_name}', using default pricing") + return {"input": 1.00, "output": 1.00} + + +def calculate_cost(model_name: str, usage: dict) -> float: + """ + Calculate the cost of an API call. 
+
+    Args:
+        model_name: Name of the model used
+        usage: Dictionary with 'prompt_tokens' and 'completion_tokens'
+
+    Returns:
+        Cost in USD
+    """
+    if not usage:
+        return 0.0
+
+    pricing = get_pricing(model_name)
+    prompt_tokens = usage.get('prompt_tokens', 0)
+    completion_tokens = usage.get('completion_tokens', 0)
+
+    # Calculate cost: tokens * (price per 1M tokens) / 1,000,000
+    input_cost = prompt_tokens * pricing['input'] / 1_000_000
+    output_cost = completion_tokens * pricing['output'] / 1_000_000
+
+    return input_cost + output_cost
+
+
+def format_cost(cost: float) -> str:
+    """Format cost as a string with appropriate precision."""
+    if cost < 0.001:
+        return f"${cost:.6f}"
+    elif cost < 0.01:
+        return f"${cost:.5f}"
+    else:
+        return f"${cost:.4f}"
diff --git a/src/simulated_web_agent/main/experiment.py b/src/simulated_web_agent/main/experiment.py
index 1c76da2..0e58c4b 100644
--- a/src/simulated_web_agent/main/experiment.py
+++ b/src/simulated_web_agent/main/experiment.py
@@ -15,11 +15,105 @@
 from ..agent import context, gpt
 from ..executor.env import WebAgentEnv  # Playwright env
 from .model import AgentPolicy  # noqa
+from .cost_calculator import format_cost
 
 log = logging.getLogger("simulated_web_agent.main.experiment")
 logging.basicConfig(level=logging.INFO)
 
 
+def _generate_token_report(trace_dir: pathlib.Path) -> dict:
+    """
+    Generate a token and cost report from api_trace files.
+
+    Args:
+        trace_dir: Directory containing api_trace files
+
+    Returns:
+        Dictionary with token and cost statistics
+    """
+    api_trace_dir = trace_dir / "api_trace"
+    if not api_trace_dir.exists():
+        return {}
+
+    total_prompt_tokens = 0
+    total_completion_tokens = 0
+    total_tokens = 0
+    total_cost = 0.0
+    api_calls_count = 0
+    method_counts = {}
+
+    # Read all api_trace files
+    for api_trace_file in sorted(api_trace_dir.glob("api_trace_*.json")):
+        try:
+            with open(api_trace_file, "r") as f:
+                data = json.load(f)
+            api_calls_count += 1
+
+            # Count by method
+            method = data.get("method_name", "unknown")
+            method_counts[method] = method_counts.get(method, 0) + 1
+
+            # Sum tokens
+            usage = data.get("usage")
+            if usage:
+                prompt_tokens = usage.get("prompt_tokens", 0)
+                completion_tokens = usage.get("completion_tokens", 0)
+                total_prompt_tokens += prompt_tokens
+                total_completion_tokens += completion_tokens
+                total_tokens += usage.get("total_tokens", 0)
+
+            # Sum costs
+            cost = data.get("cost", 0.0)
+            if cost:
+                total_cost += cost
+        except Exception as e:
+            log.warning(f"Error reading {api_trace_file}: {e}")
+
+    report = {
+        "api_calls": api_calls_count,
+        "total_prompt_tokens": total_prompt_tokens,
+        "total_completion_tokens": total_completion_tokens,
+        "total_tokens": total_tokens,
+        "total_cost": total_cost,
+        "total_cost_formatted": format_cost(total_cost),
+        "method_calls": method_counts,
+    }
+
+    return report
+
+
+def _save_token_report(trace_dir: pathlib.Path, report: dict) -> None:
+    """
+    Save token report to a JSON file and print summary.
+
+    Args:
+        trace_dir: Directory to save the report
+        report: Token report dictionary
+    """
+    if not report:
+        return
+
+    # Save as JSON
+    report_file = trace_dir / "token_report.json"
+    with open(report_file, "w") as f:
+        json.dump(report, f, indent=2)
+
+    # Print summary
+    log.info("=" * 60)
+    log.info("TOKEN AND COST REPORT")
+    log.info("=" * 60)
+    log.info(f"Total API Calls: {report['api_calls']}")
+    log.info(f"Total Tokens Used: {report['total_tokens']:,}")
+    log.info(f"  - Prompt Tokens: {report['total_prompt_tokens']:,}")
+    log.info(f"  - Completion Tokens: {report['total_completion_tokens']:,}")
+    log.info(f"Total Cost: {report['total_cost_formatted']}")
+    log.info("")
+    log.info("API Calls by Method:")
+    for method, count in sorted(report['method_calls'].items()):
+        log.info(f"  - {method}: {count} calls")
+    log.info("=" * 60)
+
+
 async def _run_for_persona_and_intent(
     cfg: DictConfig,
     persona_info: Dict,
@@ -165,6 +259,16 @@ async def clear_cart(env):
                 json.dump(policy.agent.memory.memories, f)
             print(f"Taking action {action}")
             print(f"Action: {steps_taken + 1} out of {max_steps}")
+
+            # Update and display real-time token statistics
+            token_report = _generate_token_report(trace_dir)
+            if token_report:
+                print(f"[Token Stats] Total Calls: {token_report['api_calls']}, "
+                      f"Tokens: {token_report['total_tokens']:,}, "
+                      f"Cost: {token_report['total_cost_formatted']}")
+                # Save report immediately so it's available even if interrupted
+                _save_token_report(trace_dir, token_report)
+
             obs = await env.step(action)
             steps_taken += 1
@@ -184,6 +288,20 @@ async def clear_cart(env):
 
         log.info(f"Saved memory trace to {trace_file}")
 
+        # ---- generate and save final token/cost report ----
+        token_report = _generate_token_report(trace_dir)
+        _save_token_report(trace_dir, token_report)
+        print("\n" + "=" * 60)
+        print("FINAL TOKEN AND COST REPORT")
+        print("=" * 60)
+        if token_report:
+            print(f"Total API Calls: {token_report['api_calls']}")
+            print(f"Total Tokens: {token_report['total_tokens']:,}")
+            print(f"  - Prompt Tokens: {token_report['total_prompt_tokens']:,}")
+            print(f"  - Completion Tokens: {token_report['total_completion_tokens']:,}")
+            print(f"Total Cost: {token_report['total_cost_formatted']}")
+        print("=" * 60 + "\n")
+
     except Exception:
         err = traceback.format_exc()
         print(err)
@@ -191,6 +309,22 @@ async def clear_cart(env):
             (policy.run_path / "error.txt").write_text(err)  # type: ignore[attr-defined]
         except Exception:
             pass
+        # Still generate report even if there was an error
+        try:
+            token_report = _generate_token_report(trace_dir)
+            if token_report:
+                print("\n" + "=" * 60)
+                print("PARTIAL TOKEN AND COST REPORT (before error)")
+                print("=" * 60)
+                print(f"Total API Calls: {token_report['api_calls']}")
+                print(f"Total Tokens: {token_report['total_tokens']:,}")
+                print(f"  - Prompt Tokens: {token_report['total_prompt_tokens']:,}")
+                print(f"  - Completion Tokens: {token_report['total_completion_tokens']:,}")
+                print(f"Total Cost: {token_report['total_cost_formatted']}")
+                print("=" * 60 + "\n")
+                _save_token_report(trace_dir, token_report)
+        except Exception:
+            pass
     finally:
         try:
             log.info(f"[{run_uid}] closing env...")
diff --git a/tools/README.md b/tools/README.md
new file mode 100644
index 0000000..d63317b
--- /dev/null
+++ b/tools/README.md
@@ -0,0 +1,161 @@
+# UXAgent API Trace Analysis Tools
+
+This directory contains tools that help you read and analyze the LLM input/output logs (API traces) that UXAgent stores while it runs.
+
+## Tool Overview
+
+### 1. `format_api_trace.py` - Format API trace files
+
+Converts the raw JSON API trace files into an easy-to-read Markdown format.
+
+**Usage:**
+```bash
+python3 tools/format_api_trace.py <run_dir>
+```
+
+**Example:**
+```bash
+python3 tools/format_api_trace.py runs/2025-11-28_08-31-02_a1c0
+```
+
+**Output:**
+- Generates Markdown files under the `runs/<run_dir>/api_trace_formatted/` directory
+- One file per API call, e.g. `api_trace_1_perceive.md`
+
+**Included content:**
+- Method name (perceive, plan, act)
+- Time taken
+- The formatted system prompt
+- The HTML page sent as user input
+- The model's response
+
+### 2. `generate_api_summary.py` - Generate a run summary
+
+Generates a summary report for an entire run, including a timeline, statistics, and detailed execution steps.
+
+**Usage:**
+```bash
+python3 tools/generate_api_summary.py <run_dir>
+```
+
+**Example:**
+```bash
+python3 tools/generate_api_summary.py runs/2025-11-28_08-31-02_a1c0
+```
+
+**Output:**
+- Writes the summary file to `runs/<run_dir>/api_summary.md`
+
+**Included content:**
+1. **Execution timeline table**: the number, method, time taken, and a preview for each API call
+2. **Method statistics**: call counts and timing statistics per method (perceive, plan, act)
+3. **Detailed execution log**: a detailed log organized by cycle, one per perceive-plan-act loop
+
+## API Trace File Structure
+
+The raw API trace files are stored at:
+```
+runs/<run_dir>/api_trace/api_trace_<N>.json
+```
+
+Each file contains:
+```json
+{
+  "request": [
+    [
+      {"role": "system", "content": "..."},
+      {"role": "user", "content": "..."}
+    ]
+  ],
+  "response": ["LLM output..."],
+  "method_name": "perceive|plan|act",
+  "retrieve_result": [],
+  "time": 51.93,
+  "usage": {"prompt_tokens": 12345, "completion_tokens": 678, "total_tokens": 13023},
+  "cost": 0.0123,
+  "model": "gpt-5-mini"
+}
+```
+
+## The Agent's Three Core Methods
+
+### Perceive
+The agent observes the current web page and makes sense of its content.
+- **Input**: the page's HTML content
+- **Output**: a natural-language description of what was observed
+
+### Plan
+The agent forms a plan for the next step based on its observations and goal.
+- **Input**: observations, goal, memory
+- **Output**: a plan and its rationale
+
+### Act
+The agent performs concrete actions (clicking, typing text, etc.).
+- **Input**: the plan
+- **Output**: a list of concrete actions
+
+## Examples
+
+### View a single formatted API trace
+```bash
+# First generate the formatted files
+python3 tools/format_api_trace.py runs/2025-11-28_08-31-02_a1c0
+
+# Then view a specific file
+cat runs/2025-11-28_08-31-02_a1c0/api_trace_formatted/api_trace_1_perceive.md
+```
+
+### View the run summary
+```bash
+# Generate the summary
+python3 tools/generate_api_summary.py runs/2025-11-28_08-31-02_a1c0
+
+# View the summary
+cat runs/2025-11-28_08-31-02_a1c0/api_summary.md
+```
+
+## Performance Analysis
+
+These tools give a quick picture of the agent's performance:
+
+- **Perceive time**: usually the longest (30-100s), since it has to analyze complex HTML
+- **Plan time**: moderate (10-30s), depending on how complex the plan is
+- **Act time**: usually short (1-15s)
+
+If a method takes unusually long, it may indicate:
+- The page content is overly complex
+- The model needs more thinking time
+- Network latency
+
+## Troubleshooting
+
+### File read errors
+If you see "failed to read" errors, check that:
+1. The run directory path is correct
+2. The API trace files are complete and not corrupted
+
+### Encoding issues
+If you see garbled characters, make sure:
+1. You are using Python 3.8+
+2. Your terminal supports UTF-8
+
+## Advanced Usage
+
+### View only the API traces for a specific method
+```bash
+# Only the perceive traces
+ls runs/2025-11-28_08-31-02_a1c0/api_trace_formatted/*_perceive.md
+
+# Only the act traces
+ls runs/2025-11-28_08-31-02_a1c0/api_trace_formatted/*_act.md
+```
+
+### Search for specific content
+```bash
+# Find every log that mentions "jacket"
+grep -r "jacket" runs/2025-11-28_08-31-02_a1c0/api_trace_formatted/
+
+# Search for specific errors or warnings
+grep -r "error\|warn" runs/2025-11-28_08-31-02_a1c0/api_trace_formatted/
+```
+
+## Contributing
+
+Suggestions for improving these tools and bug reports are welcome!
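+
+## Estimating Cost per Method
+
+Each trace file now also records `usage`, `cost`, and `model` (see the structure above), and every run directory gets a `token_report.json` with aggregate totals. The snippet below is a minimal sketch for breaking cost down by method from the raw trace files; the run directory name is illustrative.
+
+```python
+import json
+from collections import defaultdict
+from pathlib import Path
+
+run_dir = Path("runs/2025-11-28_08-31-02_a1c0")  # adjust to your run
+cost_by_method = defaultdict(float)
+
+for trace_file in sorted((run_dir / "api_trace").glob("api_trace_*.json")):
+    data = json.loads(trace_file.read_text())
+    # "cost" is only present in traces produced after this change
+    cost_by_method[data.get("method_name", "unknown")] += data.get("cost", 0.0)
+
+for method, cost in sorted(cost_by_method.items()):
+    print(f"{method}: ${cost:.4f}")
+```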
diff --git a/tools/format_api_trace.py b/tools/format_api_trace.py
new file mode 100644
index 0000000..794b30a
--- /dev/null
+++ b/tools/format_api_trace.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+Tool for formatting API trace files.
+Converts the API trace JSON files in a run directory into a more readable Markdown or HTML format.
+"""
+
+import json
+from pathlib import Path
+from typing import Optional, Dict, Any
+import argparse
+
+
+def format_content_for_markdown(content: str, max_width: int = 100) -> str:
+    """Reflow long text into a more readable form."""
+    # If it's JSON, try to pretty-print it
+    try:
+        data = json.loads(content)
+        return json.dumps(data, ensure_ascii=False, indent=2)
+    except Exception:
+        pass
+
+    # Otherwise, wrap lines to the given width
+    lines = content.split('\n')
+    formatted_lines = []
+    for line in lines:
+        if len(line) > max_width:
+            # Simple word-wrapping
+            words = line.split(' ')
+            current_line = ""
+            for word in words:
+                if len(current_line) + len(word) + 1 > max_width:
+                    if current_line:
+                        formatted_lines.append(current_line)
+                    current_line = word
+                else:
+                    if current_line:
+                        current_line += " " + word
+                    else:
+                        current_line = word
+            if current_line:
+                formatted_lines.append(current_line)
+        else:
+            formatted_lines.append(line)
+
+    return '\n'.join(formatted_lines)
+
+
+def create_markdown_output(api_trace_data: Dict[str, Any], filename: str) -> str:
+    """Convert an API trace into Markdown."""
+    md = []
+    md.append(f"# {filename}\n")
+
+    # Basic information
+    md.append(f"**Method**: {api_trace_data.get('method_name', 'N/A')}")
+    time_taken = api_trace_data.get('time')
+    md.append(f"**Time taken**: {time_taken:.2f}s\n" if time_taken is not None else "**Time taken**: N/A\n")
+
+    # Request
+    md.append("## Request\n")
+    requests = api_trace_data.get('request', [])
+    if requests and len(requests) > 0:
+        for msg in requests[0]:  # take the first request group
+            role = msg.get('role', 'unknown')
+            content = msg.get('content', '')
+
+            if role == 'system':
+                md.append("### System Prompt\n")
+                # System prompts are usually long, so show an abbreviated view
+                lines = content.split('\n')
+                if len(lines) > 20:
+                    md.append(f"```\n{chr(10).join(lines[:10])}\n\n... ({len(lines)-20} lines omitted) ...\n\n{chr(10).join(lines[-10:])}\n```\n")
+                else:
+                    md.append(f"```\n{content}\n```\n")
+            else:
+                md.append("### User Input\n")
+                # Try to format the content
+                formatted = format_content_for_markdown(content)
+                if len(formatted) > 2000:
+                    # If it's too long, only show the beginning and the end
+                    md.append(f"```\n{formatted[:1000]}\n\n... (middle omitted) ...\n\n{formatted[-1000:]}\n```\n")
+                else:
+                    md.append(f"```\n{formatted}\n```\n")
+
+    # Response
+    md.append("## Response\n")
+    responses = api_trace_data.get('response', [])
+    if responses:
+        response_text = responses[0] if isinstance(responses, list) else str(responses)
+        formatted_response = format_content_for_markdown(response_text)
+
+        if len(formatted_response) > 2000:
+            md.append(f"```\n{formatted_response[:1500]}\n\n... (middle omitted) ...\n\n{formatted_response[-500:]}\n```\n")
+        else:
+            md.append(f"```\n{formatted_response}\n```\n")
+
+    return '\n'.join(md)
+
+
+def format_api_trace_files(run_path: str, output_format: str = 'markdown', output_dir: Optional[str] = None):
+    """
+    Format all API trace files in a run directory.
+
+    Args:
+        run_path: Path to a run under the runs directory
+        output_format: Output format ('markdown' or 'html')
+        output_dir: Output directory; if None, one is created inside the run directory
+    """
+    run_path = Path(run_path)
+    api_trace_dir = run_path / "api_trace"
+
+    if not api_trace_dir.exists():
+        print(f"Error: {api_trace_dir} does not exist")
+        return
+
+    if output_dir is None:
+        output_dir = run_path / "api_trace_formatted"
+    else:
+        output_dir = Path(output_dir)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Collect all api_trace_{number}.json files
+    api_trace_files = sorted(api_trace_dir.glob("api_trace_*.json"), key=lambda x: int(x.stem.split('_')[2]))
+
+    print(f"Found {len(api_trace_files)} API trace files\n")
+    print(f"Writing output to: {output_dir}\n")
+
+    for api_file in api_trace_files:
+        try:
+            with open(api_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            # Build the output file name
+            base_name = api_file.stem  # api_trace_1
+            method_name = data.get('method_name', 'unknown')
+            output_filename = f"{base_name}_{method_name}.md"
+            output_path = output_dir / output_filename
+
+            # Generate the Markdown content
+            content = create_markdown_output(data, f"{base_name} ({method_name})")
+
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(content)
+
+            print(f"✓ {output_filename} ({data.get('time', 0):.2f}s)")
+
+        except Exception as e:
+            print(f"✗ {api_file.name} - error: {e}")
+
+    print(f"\nDone! All files written to {output_dir}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Format UXAgent API trace files',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python3 format_api_trace.py /Users/akiraeason/Desktop/UXAgent/runs/2025-11-28_08-31-02_a1c0
+  python3 format_api_trace.py /Users/akiraeason/Desktop/UXAgent/runs/2025-11-28_08-31-02_a1c0 -o ./output
+        """
+    )
+
+    parser.add_argument('run_path', help='path to the run directory')
+    parser.add_argument('-o', '--output', help='output directory (default: run_path/api_trace_formatted)')
+    parser.add_argument('-f', '--format', choices=['markdown', 'html'], default='markdown', help='output format')
+
+    args = parser.parse_args()
+
+    format_api_trace_files(args.run_path, args.format, args.output)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/generate_api_summary.py b/tools/generate_api_summary.py
new file mode 100644
index 0000000..f2fca6f
--- /dev/null
+++ b/tools/generate_api_summary.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+"""
+Generate a summary report of an API trace run.
+Includes a timeline, per-method statistics, and timing analysis.
+"""
+
+import json
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+import argparse
+from collections import defaultdict
+
+
+def load_api_traces(api_trace_dir: Path) -> List[tuple[int, Dict[str, Any]]]:
+    """Load all API trace files, sorted by number."""
+    traces = []
+    for file in sorted(api_trace_dir.glob("api_trace_*.json"), key=lambda x: int(x.stem.split('_')[2])):
+        trace_num = int(file.stem.split('_')[2])
+        try:
+            with open(file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            traces.append((trace_num, data))
+        except Exception as e:
+            print(f"Warning: failed to read {file.name}: {e}")
+
+    return traces
+
+
+def extract_user_input_preview(request_data: list, max_chars: int = 200) -> str:
+    """Extract a preview of the user input from the request."""
+    if not request_data or not request_data[0]:
+        return "N/A"
+
+    user_msg = None
+    for msg in request_data[0]:
+        if msg.get('role') == 'user':
+            user_msg = msg.get('content', '')
+            break
+
+    if not user_msg:
+        return "N/A"
+
+    # Strip JSON quoting and escape characters
+    preview = user_msg.replace('\\"', '"').replace('\\n', ' ')
+
+    # Try to parse as JSON for a better preview
+    try:
+        if preview.startswith('"'):
+            preview = preview[1:-1]
+    except Exception:
+        pass
+
+    # Truncate the preview
+    if len(preview) > max_chars:
+        return preview[:max_chars] + "..."
+    return preview
+
+
+def extract_response_preview(response_data: list, max_chars: int = 150) -> str:
+    """Extract a preview of the reply from the response."""
+    if not response_data:
+        return "N/A"
+
+    response_text = response_data[0] if isinstance(response_data, list) else str(response_data)
+
+    # Try to parse as JSON
+    try:
+        data = json.loads(response_text)
+        # If there are observations, take the first one
+        if 'observations' in data and data['observations']:
+            preview = data['observations'][0]
+        else:
+            preview = json.dumps(data, ensure_ascii=False)[:max_chars]
+    except Exception:
+        preview = response_text
+
+    if len(preview) > max_chars:
+        return preview[:max_chars] + "..."
+    return preview
+
+
+def generate_summary_report(run_path: str, output_file: Optional[str] = None) -> str:
+    """Generate a summary report of the API calls in a run."""
+    run_path = Path(run_path)
+    api_trace_dir = run_path / "api_trace"
+
+    if not api_trace_dir.exists():
+        return f"Error: {api_trace_dir} does not exist"
+
+    traces = load_api_traces(api_trace_dir)
+
+    if not traces:
+        return f"Error: no API trace files found in {api_trace_dir}"
+
+    # Statistics
+    method_counts = defaultdict(int)
+    method_times = defaultdict(float)
+    total_time = 0.0
+    method_details = defaultdict(list)
+
+    # Build the report
+    report = []
+    report.append("# UXAgent API Execution Summary\n")
+    report.append(f"Run: {run_path.name}\n")
+    report.append(f"Number of API trace files: {len(traces)}\n\n")
+
+    # Timeline
+    report.append("## Execution Timeline\n")
+    report.append("| # | Method | Time (s) | Preview |\n")
+    report.append("|---|--------|----------|---------|\n")
+
+    for trace_num, data in traces:
+        method = data.get('method_name', 'unknown')
+        time_taken = data.get('time', 0)
+
+        # Update statistics
+        method_counts[method] += 1
+        method_times[method] += time_taken
+        total_time += time_taken
+
+        # Extract previews
+        user_preview = extract_user_input_preview(data.get('request', []), max_chars=50)
+        response_preview = extract_response_preview(data.get('response', []), max_chars=50)
+
+        # Pick the preview to show based on the method
+        if method == 'perceive':
+            preview = f"Observed: {response_preview[:40]}"
+        elif method == 'plan':
+            preview = f"Plan: {response_preview[:40]}"
+        elif method == 'act':
+            preview = f"Action: {response_preview[:40]}"
+        else:
+            preview = response_preview[:40]
+
+        report.append(f"| {trace_num} | {method} | {time_taken:.2f} | {preview} |\n")
+
+    # Statistics summary
+    report.append("\n## Method Statistics\n")
+    report.append("| Method | Calls | Total time (s) | Average time (s) |\n")
+    report.append("|--------|-------|----------------|------------------|\n")
+
+    for method in sorted(method_counts.keys()):
+        count = method_counts[method]
+        total = method_times[method]
+        avg = total / count if count > 0 else 0
+        report.append(f"| {method} | {count} | {total:.2f} | {avg:.2f} |\n")
+
+    report.append(f"\n**Total execution time**: {total_time:.2f}s\n")
+
+    # Detailed per-method log
+    report.append("\n## Detailed Execution Log\n")
+
+    current_cycle = 0
+    perceived_step = False
+
+    for trace_num, data in traces:
+        method = data.get('method_name', 'unknown')
+        time_taken = data.get('time', 0)
+
+        # Track cycles (perceive -> plan -> act)
+        if method == 'perceive':
+            current_cycle += 1
+            perceived_step = True
+            report.append(f"\n### Cycle {current_cycle}\n")
+
+        if perceived_step:
+            report.append(f"\n**{method.upper()}** (#{trace_num}, {time_taken:.2f}s)\n")
+
+        # Provide a different summary depending on the method
+        request_data = data.get('request', [])
+        response_data = data.get('response', [])
+
+        if method == 'perceive':
+            # Show a summary of what was observed
+            preview = extract_response_preview(response_data, max_chars=300)
+            report.append(f"Observation: {preview}\n")
+
+        elif method == 'plan':
+            # Show the plan
+            preview = extract_response_preview(response_data, max_chars=300)
+            report.append(f"Plan: {preview}\n")
+
+        elif method == 'act':
+            # Show the actions taken
+            preview = extract_response_preview(response_data, max_chars=200)
+            report.append(f"Action: {preview}\n")
+
+    report_str = ''.join(report)
+
+    # Write to file
+    if output_file is None:
+        output_file = run_path / "api_summary.md"
+    else:
+        output_file = Path(output_file)
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(report_str)
+
+    print(f"✓ Summary report generated: {output_file}")
+    return report_str
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Generate an execution summary from UXAgent API traces',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python3 generate_api_summary.py /Users/akiraeason/Desktop/UXAgent/runs/2025-11-28_08-31-02_a1c0
+  python3 generate_api_summary.py /Users/akiraeason/Desktop/UXAgent/runs/2025-11-28_08-31-02_a1c0 -o ./summary.md
+        """
+    )
+
+    parser.add_argument('run_path', help='path to the run directory')
+    parser.add_argument('-o', '--output', help='output file path')
+
+    args = parser.parse_args()
+
+    generate_summary_report(args.run_path, args.output)
+
+
+if __name__ == '__main__':
+    main()
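For reference, a minimal sketch of how the new helpers in `cost_calculator.py` compose. The token counts are made up, and the import path assumes the package layout used in this diff:

```python
from simulated_web_agent.main.cost_calculator import calculate_cost, format_cost

# Hypothetical usage payload, shaped like the "usage" dict stored in each api_trace file
usage = {"prompt_tokens": 12_000, "completion_tokens": 800}

cost = calculate_cost("openai/gpt-4o-mini", usage)
# 12_000 * 0.15 / 1_000_000 + 800 * 0.60 / 1_000_000 = 0.0018 + 0.00048 = 0.00228
print(format_cost(cost))  # -> $0.00228
```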