diff --git a/.github/workflows/pr-smoke-test.yml b/.github/workflows/pr-smoke-test.yml
index 399278f78eb..f6111dbc61f 100644
--- a/.github/workflows/pr-smoke-test.yml
+++ b/.github/workflows/pr-smoke-test.yml
@@ -181,21 +181,11 @@ jobs:
       - name: Make Binary Executable
         run: chmod +x target/debug/goose
 
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-
-      - name: Install agentic providers
-        run: npm install -g @anthropic-ai/claude-code @openai/codex @google/gemini-cli
-
       - name: Run Provider Tests (Code Execution Mode)
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
-          GEMINI_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
           DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}
           DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
@@ -208,7 +198,7 @@ jobs:
         run: |
           mkdir -p $HOME/.local/share/goose/sessions
           mkdir -p $HOME/.config/goose
-          bash scripts/test_providers.sh --code-exec
+          bash scripts/test_providers_code_exec.sh
 
   compaction-tests:
     name: Compaction Tests
diff --git a/scripts/test_providers.sh b/scripts/test_providers.sh
index b8a302c3e25..203cef6bf99 100755
--- a/scripts/test_providers.sh
+++ b/scripts/test_providers.sh
@@ -1,437 +1,57 @@
 #!/bin/bash
-# Test providers with optional code_execution mode
-# Usage:
-#   ./test_providers.sh              # Normal mode (direct tool calls)
-#   ./test_providers.sh --code-exec  # Code execution mode (JS batching)
-#
-# Environment variables:
-#   SKIP_PROVIDERS   Comma-separated list of providers to skip (e.g., "tetrate,xai")
-#   SKIP_BUILD       Skip the cargo build step if set
-
-CODE_EXEC_MODE=false
-for arg in "$@"; do
-    case $arg in
-        --code-exec)
-            CODE_EXEC_MODE=true
-            ;;
-    esac
-done
+LIB_DIR="$(cd "$(dirname "$0")" && pwd)"
+source "$LIB_DIR/test_providers_lib.sh"
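+
+# Provider smoke tests - normal mode (direct tool calls)
+#
+# Usage:
+#   ./test_providers.sh
+#
+# Environment knobs (consumed here and in test_providers_lib.sh):
+#   SKIP_PROVIDERS  Comma-separated list of providers to skip (e.g. "tetrate,xai")
+#   SKIP_BUILD      Skip the cargo build step if set
+#   MAX_PARALLEL    Cap on concurrent test jobs (defaults to the CPU count)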
- echo "" -fi +echo "Mode: normal (direct tool calls)" +echo "" -SCRIPT_DIR=$(pwd) +GOOSE_BIN=$(build_goose) +BUILTINS="developer" -# Create a test file with known content in the current directory -# This cannot be /tmp as some agents cannot work outside the PWD mkdir -p target TEST_CONTENT="test-content-abc123" TEST_FILE="./target/test-content.txt" echo "$TEST_CONTENT" > "$TEST_FILE" -# Format: "provider -> model1|model2|model3" -# Base providers that are always tested (with appropriate env vars) -PROVIDERS=( - "openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b" - "xai -> grok-3" - "openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5" - "anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-1-20250805" - "google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview" - "tetrate -> claude-sonnet-4-20250514" -) - -# Conditionally add providers based on environment variables - -# Databricks: requires DATABRICKS_HOST and DATABRICKS_TOKEN -if [ -n "$DATABRICKS_HOST" ] && [ -n "$DATABRICKS_TOKEN" ]; then - echo "✓ Including Databricks tests" - PROVIDERS+=("databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o") -else - echo "⚠️ Skipping Databricks tests (DATABRICKS_HOST and DATABRICKS_TOKEN required)" -fi - -# Azure OpenAI: requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME -if [ -n "$AZURE_OPENAI_ENDPOINT" ] && [ -n "$AZURE_OPENAI_DEPLOYMENT_NAME" ]; then - echo "✓ Including Azure OpenAI tests" - PROVIDERS+=("azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}") -else - echo "⚠️ Skipping Azure OpenAI tests (AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME required)" -fi - -# AWS Bedrock: requires AWS credentials (profile or keys) and AWS_REGION -if [ -n "$AWS_REGION" ] && { [ -n "$AWS_PROFILE" ] || [ -n "$AWS_ACCESS_KEY_ID" ]; }; then - echo "✓ Including AWS Bedrock tests" - PROVIDERS+=("aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0") -else - echo "⚠️ Skipping AWS Bedrock tests (AWS_REGION and AWS_PROFILE or AWS credentials required)" -fi - -# GCP Vertex AI: requires GCP_PROJECT_ID -if [ -n "$GCP_PROJECT_ID" ]; then - echo "✓ Including GCP Vertex AI tests" - PROVIDERS+=("gcp_vertex_ai -> gemini-2.5-pro") -else - echo "⚠️ Skipping GCP Vertex AI tests (GCP_PROJECT_ID required)" -fi - -# Snowflake: requires SNOWFLAKE_HOST and SNOWFLAKE_TOKEN -if [ -n "$SNOWFLAKE_HOST" ] && [ -n "$SNOWFLAKE_TOKEN" ]; then - echo "✓ Including Snowflake tests" - PROVIDERS+=("snowflake -> claude-sonnet-4-5") -else - echo "⚠️ Skipping Snowflake tests (SNOWFLAKE_HOST and SNOWFLAKE_TOKEN required)" -fi - -# Venice: requires VENICE_API_KEY -if [ -n "$VENICE_API_KEY" ]; then - echo "✓ Including Venice tests" - PROVIDERS+=("venice -> llama-3.3-70b") -else - echo "⚠️ Skipping Venice tests (VENICE_API_KEY required)" -fi - -# LiteLLM: requires LITELLM_API_KEY (and optionally LITELLM_HOST) -if [ -n "$LITELLM_API_KEY" ]; then - echo "✓ Including LiteLLM tests" - PROVIDERS+=("litellm -> gpt-4o-mini") -else - echo "⚠️ Skipping LiteLLM tests (LITELLM_API_KEY required)" -fi - -# Ollama: requires OLLAMA_HOST (or uses default localhost:11434) -if [ -n "$OLLAMA_HOST" ] || command -v ollama &> /dev/null; then - echo "✓ Including Ollama tests" - PROVIDERS+=("ollama -> qwen3") -else - echo "⚠️ Skipping Ollama tests (OLLAMA_HOST required or ollama must be installed)" -fi - -# SageMaker TGI: requires AWS credentials and SAGEMAKER_ENDPOINT_NAME -if [ -n "$SAGEMAKER_ENDPOINT_NAME" ] && [ -n 
"$AWS_REGION" ]; then - echo "✓ Including SageMaker TGI tests" - PROVIDERS+=("sagemaker_tgi -> sagemaker-tgi-endpoint") -else - echo "⚠️ Skipping SageMaker TGI tests (SAGEMAKER_ENDPOINT_NAME and AWS_REGION required)" -fi - -# GitHub Copilot: requires OAuth setup (check for cached token) -if [ -n "$GITHUB_COPILOT_TOKEN" ] || [ -f "$HOME/.config/goose/github_copilot_token.json" ]; then - echo "✓ Including GitHub Copilot tests" - PROVIDERS+=("github_copilot -> gpt-4.1") -else - echo "⚠️ Skipping GitHub Copilot tests (OAuth setup required - run 'goose configure' first)" -fi - -# ChatGPT Codex: requires OAuth setup -if [ -n "$CHATGPT_CODEX_TOKEN" ] || [ -f "$HOME/.config/goose/chatgpt_codex_token.json" ]; then - echo "✓ Including ChatGPT Codex tests" - PROVIDERS+=("chatgpt_codex -> gpt-5.1-codex") -else - echo "⚠️ Skipping ChatGPT Codex tests (OAuth setup required - run 'goose configure' first)" -fi - -# CLI-based providers (require the CLI tool to be installed) - -# Claude Code CLI: requires 'claude' CLI tool -if command -v claude &> /dev/null; then - echo "✓ Including Claude Code CLI tests" - PROVIDERS+=("claude-code -> claude-sonnet-4-20250514") -else - echo "⚠️ Skipping Claude Code CLI tests ('claude' CLI tool required)" -fi - -# Codex CLI: requires 'codex' CLI tool -if command -v codex &> /dev/null; then - echo "✓ Including Codex CLI tests" - PROVIDERS+=("codex -> gpt-5.2-codex") -else - echo "⚠️ Skipping Codex CLI tests ('codex' CLI tool required)" -fi - -# Gemini CLI: requires 'gemini' CLI tool -if command -v gemini &> /dev/null; then - echo "✓ Including Gemini CLI tests" - PROVIDERS+=("gemini-cli -> gemini-2.5-pro") -else - echo "⚠️ Skipping Gemini CLI tests ('gemini' CLI tool required)" -fi - -# Cursor Agent: requires 'cursor-agent' CLI tool -if command -v cursor-agent &> /dev/null; then - echo "✓ Including Cursor Agent tests" - PROVIDERS+=("cursor-agent -> auto") -else - echo "⚠️ Skipping Cursor Agent tests ('cursor-agent' CLI tool required)" -fi - -echo "" - -# Configure mode-specific settings -if [ "$CODE_EXEC_MODE" = true ]; then - echo "Mode: code_execution (JS batching)" - BUILTINS="developer,code_execution" - # Match code_execution tool usage: - # - "execute | code_execution" or "get_function_details | code_execution" (fallback format) - # - "tool call | execute" or "tool calls | execute" (new format with tool_graph) - SUCCESS_PATTERN="(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? 
\| execute)" - SUCCESS_MSG="code_execution tool called" - FAILURE_MSG="no code_execution tools called" -else - echo "Mode: normal (direct tool calls)" - BUILTINS="developer,autovisualiser,computercontroller,tutorial,todo,extensionmanager" - SUCCESS_PATTERN="shell \| developer" - SUCCESS_MSG="developer tool called" - FAILURE_MSG="no developer tools called" -fi -echo "" - -is_allowed_failure() { - local provider="$1" - local model="$2" - local key="${provider}:${model}" - for allowed in "${ALLOWED_FAILURES[@]}"; do - if [ "$allowed" = "$key" ]; then - return 0 - fi - done - return 1 -} - -should_skip_provider() { - local provider="$1" - if [ -z "$SKIP_PROVIDERS" ]; then - return 1 - fi - IFS=',' read -ra SKIP_LIST <<< "$SKIP_PROVIDERS" - for skip in "${SKIP_LIST[@]}"; do - # Trim whitespace - skip=$(echo "$skip" | xargs) - if [ "$skip" = "$provider" ]; then - return 0 - fi - done - return 1 -} - -is_agentic_provider() { - local provider="$1" - for agentic in "${AGENTIC_PROVIDERS[@]}"; do - if [ "$agentic" = "$provider" ]; then - return 0 - fi - done - return 1 -} - -# Create temp directory for results -RESULTS_DIR=$(mktemp -d) -trap "rm -rf $RESULTS_DIR" EXIT - -# Maximum parallel jobs (default: number of CPU cores, or override with MAX_PARALLEL) -MAX_PARALLEL=${MAX_PARALLEL:-$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 8)} -echo "Running tests with up to $MAX_PARALLEL parallel jobs" -echo "" - -# Function to run a single test run_test() { - local provider="$1" - local model="$2" - local result_file="$3" - local output_file="$4" - + local provider="$1" model="$2" result_file="$3" output_file="$4" local testdir=$(mktemp -d) - # Agentic providers use a file-read prompt with known content marker; - # regular providers use the shell prompt that produces tool-call logs. local prompt if is_agentic_provider "$provider"; then cp "$TEST_FILE" "$testdir/test-content.txt" prompt="read ./test-content.txt and output its contents exactly" else - echo "hello" > "$testdir/hello.txt" - prompt="Immediately use the shell tool to run 'ls'. Do not ask for confirmation." + echo "$TEST_CONTENT" > "$testdir/input.txt" + prompt="Use the text_editor view command to read ./input.txt, then output this file's contents in UPPERCASE. 
Do NOT use any other tool in Developer" fi - # Run the test and capture output ( export GOOSE_PROVIDER="$provider" export GOOSE_MODEL="$model" - cd "$testdir" && "$SCRIPT_DIR/target/debug/goose" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 + cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 ) > "$output_file" 2>&1 - # Check result: agentic providers return text containing the test content - # instead of producing tool-call log patterns if is_agentic_provider "$provider"; then if grep -qi "$TEST_CONTENT" "$output_file"; then - echo "success" > "$result_file" + echo "success|test content found by model" > "$result_file" else - echo "failure" > "$result_file" + echo "failure|test content not found by model" > "$result_file" fi - elif grep -qE "$SUCCESS_PATTERN" "$output_file"; then - echo "success" > "$result_file" else - echo "failure" > "$result_file" - fi - - rm -rf "$testdir" -} - -# Build list of all provider/model combinations -JOBS=() -job_index=0 -for provider_config in "${PROVIDERS[@]}"; do - PROVIDER="${provider_config%% -> *}" - MODELS_STR="${provider_config#* -> }" - - # Skip provider if it's in SKIP_PROVIDERS - if should_skip_provider "$PROVIDER"; then - echo "⊘ Skipping provider: ${PROVIDER} (SKIP_PROVIDERS)" - continue - fi - - # Agentic providers don't use goose's code_execution system - if [ "$CODE_EXEC_MODE" = true ] && is_agentic_provider "$PROVIDER"; then - echo "⊘ Skipping agentic provider in code_exec mode: ${PROVIDER}" - continue - fi - - IFS='|' read -ra MODELS <<< "$MODELS_STR" - for MODEL in "${MODELS[@]}"; do - JOBS+=("$PROVIDER|$MODEL|$job_index") - ((job_index++)) - done -done - -total_jobs=${#JOBS[@]} -echo "Starting $total_jobs tests..." -echo "" - -# Run first test sequentially if any jobs exist -if [ $total_jobs -gt 0 ]; then - echo "Running first test sequentially..." - first_job="${JOBS[0]}" - IFS='|' read -r provider model idx <<< "$first_job" - - result_file="$RESULTS_DIR/result_$idx" - output_file="$RESULTS_DIR/output_$idx" - meta_file="$RESULTS_DIR/meta_$idx" - echo "$provider|$model" > "$meta_file" - - # Run first test and wait for it to complete - run_test "$provider" "$model" "$result_file" "$output_file" - echo "First test completed." - echo "" -fi - -# Run remaining tests in parallel -if [ $total_jobs -gt 1 ]; then - echo "Running remaining tests in parallel..." 
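+        # goose logs each direct tool call as "<tool> | <extension>"
+        # (e.g. "text_editor | developer"); these checks key on that log line.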
+        if ! grep -q "text_editor | developer" "$output_file"; then
+            echo "failure|model did not use text_editor tool" > "$result_file"
+        elif ! grep -q "TEST-CONTENT-ABC123" "$output_file"; then
+            echo "failure|model did not return uppercased file content" > "$result_file"
         else
-            echo "✗ FAILED: Test failed - $FAILURE_MSG"
-            RESULTS+=("✗ ${provider}: ${model}")
-            HARD_FAILURES+=("${provider}: ${model}")
+            echo "success|model read and uppercased file content" > "$result_file"
         fi
     fi
-    echo "---"
-done
 
-echo ""
-echo "=== Test Summary ==="
-for result in "${RESULTS[@]}"; do
-    echo "$result"
-done
+    rm -rf "$testdir"
+}
 
-if [ ${#HARD_FAILURES[@]} -gt 0 ]; then
-    echo ""
-    echo "Hard failures (${#HARD_FAILURES[@]}):"
-    for failure in "${HARD_FAILURES[@]}"; do
-        echo "  - $failure"
-    done
-    echo ""
-    echo "Some tests failed!"
-    exit 1
-else
-    if echo "${RESULTS[@]}" | grep -q "⚠"; then
-        echo ""
-        echo "All required tests passed! (some flaky tests failed but are allowed)"
-    else
-        echo ""
-        echo "All tests passed!"
-    fi
-fi
+build_test_cases
+run_test_cases run_test
+report_results
diff --git a/scripts/test_providers_code_exec.sh b/scripts/test_providers_code_exec.sh
new file mode 100755
index 00000000000..d0737c37cef
--- /dev/null
+++ b/scripts/test_providers_code_exec.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Provider smoke tests - code execution mode (JS batching)
+
+LIB_DIR="$(cd "$(dirname "$0")" && pwd)"
+source "$LIB_DIR/test_providers_lib.sh"
+
+echo "Mode: code_execution (JS batching)"
+echo ""
+
+# --- Setup ---
+
+GOOSE_BIN=$(build_goose)
+BUILTINS="developer,code_execution"
+
+# --- Test case ---
+
+run_test() {
+    local provider="$1" model="$2" result_file="$3" output_file="$4"
+    local testdir=$(mktemp -d)
+
+    echo "hello" > "$testdir/hello.txt"
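+    # In code_execution mode the model is expected to batch tool use through
+    # the JS execute function rather than call shell directly, so a plain
+    # 'ls' task is enough to exercise that path.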
+    local prompt="Run 'ls' to list files in the current directory."
+
+    # Run goose
+    (
+        export GOOSE_PROVIDER="$provider"
+        export GOOSE_MODEL="$model"
+        cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1
+    ) > "$output_file" 2>&1
+
+    # Verify: code_execution tool must be called
+    # Matches: "execute | code_execution", "get_function_details | code_execution",
+    #          "tool call | execute", "tool calls | execute"
+    if grep -qE "(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)" "$output_file"; then
+        echo "success|code_execution tool called" > "$result_file"
+    else
+        echo "failure|no code_execution tool calls found" > "$result_file"
+    fi
+
+    rm -rf "$testdir"
+}
+
+build_test_cases --skip-agentic
+run_test_cases run_test
+report_results
diff --git a/scripts/test_providers_lib.sh b/scripts/test_providers_lib.sh
new file mode 100755
index 00000000000..b56f13a8998
--- /dev/null
+++ b/scripts/test_providers_lib.sh
@@ -0,0 +1,243 @@
+#!/bin/bash
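+
+# Shared helpers for the provider smoke-test scripts. Callers (see
+# test_providers.sh and test_providers_code_exec.sh) are expected to:
+#   1. source this file and run GOOSE_BIN=$(build_goose)
+#   2. define run_test <provider> <model> <result_file> <output_file>,
+#      writing "success|<msg>" or "failure|<msg>" to the result file
+#   3. call build_test_cases [--skip-agentic], then run_test_cases run_test,
+#      then report_results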
+
+PROVIDER_CONFIG="
+openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b
+xai -> grok-3
+openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5
+anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-5-20251101
+google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview
+tetrate -> claude-sonnet-4-20250514
+databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o
+azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}
+aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0
+gcp_vertex_ai -> gemini-2.5-pro
+snowflake -> claude-sonnet-4-5
+venice -> llama-3.3-70b
+litellm -> gpt-4o-mini
+sagemaker_tgi -> sagemaker-tgi-endpoint
+github_copilot -> gpt-4.1
+chatgpt_codex -> gpt-5.1-codex
+claude-code -> claude-sonnet-4-20250514
+codex -> gpt-5.2-codex
+gemini-cli -> gemini-2.5-pro
+cursor-agent -> auto
+ollama -> qwen3
+"
+
+# Flaky models allowed to fail without blocking PRs.
+ALLOWED_FAILURES=(
+    "google:gemini-2.5-flash"
+    "google:gemini-3-pro-preview"
+    "openrouter:nvidia/nemotron-3-nano-30b-a3b"
+    "openai:gpt-3.5-turbo"
+)
+
+AGENTIC_PROVIDERS=("claude-code" "codex" "gemini-cli" "cursor-agent")
+
+if [ -f .env ]; then
+    export $(grep -v '^#' .env | xargs)
+fi
+
+build_goose() {
+    if [ -z "$SKIP_BUILD" ]; then
+        echo "Building goose..." >&2
+        cargo build --bin goose >&2
+        echo "" >&2
+    else
+        echo "Skipping build (SKIP_BUILD is set)..." >&2
+        echo "" >&2
+    fi
+
+    echo "$(pwd)/target/debug/goose"
+}
+
+has_env() { [ -n "${!1}" ]; }
+has_cmd() { command -v "$1" &>/dev/null; }
+has_file() { [ -f "$1" ]; }
+
+is_provider_available() {
+    case "$1" in
+        openrouter) has_env OPENROUTER_API_KEY ;;
+        xai) has_env XAI_API_KEY ;;
+        openai) has_env OPENAI_API_KEY ;;
+        anthropic) has_env ANTHROPIC_API_KEY ;;
+        google) has_env GOOGLE_API_KEY ;;
+        tetrate) has_env TETRATE_API_KEY ;;
+        databricks) has_env DATABRICKS_HOST && has_env DATABRICKS_TOKEN ;;
+        azure_openai) has_env AZURE_OPENAI_ENDPOINT && has_env AZURE_OPENAI_DEPLOYMENT_NAME ;;
+        aws_bedrock) has_env AWS_REGION && { has_env AWS_PROFILE || has_env AWS_ACCESS_KEY_ID; } ;;
+        gcp_vertex_ai) has_env GCP_PROJECT_ID ;;
+        snowflake) has_env SNOWFLAKE_HOST && has_env SNOWFLAKE_TOKEN ;;
+        venice) has_env VENICE_API_KEY ;;
+        litellm) has_env LITELLM_API_KEY ;;
+        sagemaker_tgi) has_env SAGEMAKER_ENDPOINT_NAME && has_env AWS_REGION ;;
+        github_copilot) has_env GITHUB_COPILOT_TOKEN || has_file "$HOME/.config/goose/github_copilot_token.json" ;;
+        chatgpt_codex) has_env CHATGPT_CODEX_TOKEN || has_file "$HOME/.config/goose/chatgpt_codex_token.json" ;;
+        ollama) has_env OLLAMA_HOST || has_cmd ollama ;;
+        claude-code) has_cmd claude ;;
+        codex) has_cmd codex ;;
+        gemini-cli) has_cmd gemini ;;
+        cursor-agent) has_cmd cursor-agent ;;
+        *) return 0 ;;
+    esac
+}
+
+is_allowed_failure() {
+    local key="${1}:${2}"
+    for allowed in "${ALLOWED_FAILURES[@]}"; do
+        [ "$allowed" = "$key" ] && return 0
+    done
+    return 1
+}
+
+should_skip_provider() {
+    [ -z "$SKIP_PROVIDERS" ] && return 1
+    IFS=',' read -ra SKIP_LIST <<< "$SKIP_PROVIDERS"
+    for skip in "${SKIP_LIST[@]}"; do
+        skip=$(echo "$skip" | xargs)
+        [ "$skip" = "$1" ] && return 0
+    done
+    return 1
+}
+
+is_agentic_provider() {
+    for agentic in "${AGENTIC_PROVIDERS[@]}"; do
+        [ "$agentic" = "$1" ] && return 0
+    done
+    return 1
+}
+
+# build_test_cases [--skip-agentic]
+build_test_cases() {
+    local skip_agentic=false
+    [ "$1" = "--skip-agentic" ] && skip_agentic=true
+
+    local providers=()
+    while IFS= read -r line; do
+        [[ "$line" =~ ^#.*$ || -z "$line" ]] && continue
+        local provider="${line%% -> *}"
+        if is_provider_available "$provider"; then
+            providers+=("$line")
+            echo "✓ Including $provider"
+        else
+            echo "⚠️ Skipping $provider (prerequisites not met)"
+        fi
+    done <<< "$PROVIDER_CONFIG"
+    echo ""
+
+    TEST_CASES=()
+    local job_index=0
+    for provider_config in "${providers[@]}"; do
+        local provider="${provider_config%% -> *}"
+        local models_str="${provider_config#* -> }"
+
+        if should_skip_provider "$provider"; then
+            echo "⊘ Skipping provider: ${provider} (SKIP_PROVIDERS)"
+            continue
+        fi
+
+        if [ "$skip_agentic" = true ] && is_agentic_provider "$provider"; then
+            echo "⊘ Skipping agentic provider: ${provider}"
+            continue
+        fi
+
+        IFS='|' read -ra models <<< "$models_str"
+        for model in "${models[@]}"; do
+            TEST_CASES+=("$provider|$model|$job_index")
+            ((job_index++))
+        done
+    done
+}
+
+# run_test_cases
+run_test_cases() {
+    local test_fn="$1"
+
+    RESULTS_DIR=$(mktemp -d)
+    trap 'if [ -n "${RESULTS_DIR:-}" ]; then rm -rf -- "$RESULTS_DIR"; fi; if [ -n "${CLEANUP_DIR:-}" ]; then rm -rf -- "$CLEANUP_DIR"; fi' EXIT
+    MAX_PARALLEL=${MAX_PARALLEL:-$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 8)}
+    echo "Running ${#TEST_CASES[@]} tests (max $MAX_PARALLEL parallel)"
+    echo ""
+
+    local running=0
+    for ((i=0; i<${#TEST_CASES[@]}; i++)); do
+        IFS='|' read -r provider model idx <<< "${TEST_CASES[$i]}"
+
+        if [ $i -eq 0 ]; then
+            # First test runs sequentially to catch early failures
+            "$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx"
+        else
+            "$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx" &
+            ((running++))
+            if [ $running -ge $MAX_PARALLEL ]; then
+                wait -n 2>/dev/null || wait
+                ((running--))
+            fi
+        fi
+    done
+    wait
+}
+
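+# report_results: prints each test's captured output plus a summary; expects
+# result files in the "status|message" format written by the run_test callbacks.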
+report_results() {
+    echo ""
+    echo "=== Test Results ==="
+    echo ""
+
+    RESULTS=()
+    HARD_FAILURES=()
+
+    for job in "${TEST_CASES[@]}"; do
+        IFS='|' read -r provider model idx <<< "$job"
+
+        echo "Provider: $provider"
+        echo "Model: $model"
+        echo ""
+        cat "$RESULTS_DIR/output_$idx"
+        echo ""
+
+        local result_line=""
+        [ -f "$RESULTS_DIR/result_$idx" ] && result_line=$(cat "$RESULTS_DIR/result_$idx")
+        local status="${result_line%%|*}"
+        local msg="${result_line#*|}"
+
+        if [ "$status" = "success" ]; then
+            echo "✓ SUCCESS: $msg"
+            RESULTS+=("✓ ${provider}: ${model}")
+        else
+            if is_allowed_failure "$provider" "$model"; then
+                echo "⚠ FLAKY: $msg"
+                RESULTS+=("⚠ ${provider}: ${model} (flaky)")
+            else
+                echo "✗ FAILED: $msg"
+                RESULTS+=("✗ ${provider}: ${model}")
+                HARD_FAILURES+=("${provider}: ${model}")
+            fi
+        fi
+        echo "---"
+    done
+
+    echo ""
+    echo "=== Test Summary ==="
+    for result in "${RESULTS[@]}"; do
+        echo "$result"
+    done
+
+    if [ ${#HARD_FAILURES[@]} -gt 0 ]; then
+        echo ""
+        echo "Hard failures (${#HARD_FAILURES[@]}):"
+        for failure in "${HARD_FAILURES[@]}"; do
+            echo "  - $failure"
+        done
+        echo ""
+        echo "Some tests failed!"
+        exit 1
+    else
+        if echo "${RESULTS[@]}" | grep -q "⚠"; then
+            echo ""
+            echo "All required tests passed! (some flaky tests failed but are allowed)"
+        else
+            echo ""
+            echo "All tests passed!"
+        fi
+    fi
+}