288 changes: 248 additions & 40 deletions scripts/test_providers.sh
@@ -41,6 +41,7 @@ fi
SCRIPT_DIR=$(pwd)

# Format: "provider -> model1|model2|model3"
# Base providers that are always tested (with appropriate env vars)
PROVIDERS=(
"openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b"
"xai -> grok-3"
@@ -50,20 +51,132 @@ PROVIDERS=(
"tetrate -> claude-sonnet-4-20250514"
)

# Conditionally add providers based on environment variables

# Databricks: requires DATABRICKS_HOST and DATABRICKS_TOKEN
if [ -n "$DATABRICKS_HOST" ] && [ -n "$DATABRICKS_TOKEN" ]; then
echo "✓ Including Databricks tests"
PROVIDERS+=("databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o")
else
echo "⚠️ Skipping Databricks tests (DATABRICKS_HOST and DATABRICKS_TOKEN required)"
fi

# Azure OpenAI: requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME
if [ -n "$AZURE_OPENAI_ENDPOINT" ] && [ -n "$AZURE_OPENAI_DEPLOYMENT_NAME" ]; then
echo "✓ Including Azure OpenAI tests"
PROVIDERS+=("azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}")
else
echo "⚠️ Skipping Azure OpenAI tests (AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME required)"
fi

# AWS Bedrock: requires AWS credentials (profile or keys) and AWS_REGION
if [ -n "$AWS_REGION" ] && { [ -n "$AWS_PROFILE" ] || [ -n "$AWS_ACCESS_KEY_ID" ]; }; then
echo "✓ Including AWS Bedrock tests"
PROVIDERS+=("aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0")
else
echo "⚠️ Skipping AWS Bedrock tests (AWS_REGION and AWS_PROFILE or AWS credentials required)"
fi

# GCP Vertex AI: requires GCP_PROJECT_ID
if [ -n "$GCP_PROJECT_ID" ]; then
echo "✓ Including GCP Vertex AI tests"
PROVIDERS+=("gcp_vertex_ai -> gemini-2.5-pro")
else
echo "⚠️ Skipping GCP Vertex AI tests (GCP_PROJECT_ID required)"
fi

# Snowflake: requires SNOWFLAKE_HOST and SNOWFLAKE_TOKEN
if [ -n "$SNOWFLAKE_HOST" ] && [ -n "$SNOWFLAKE_TOKEN" ]; then
echo "✓ Including Snowflake tests"
PROVIDERS+=("snowflake -> claude-sonnet-4-5")
else
echo "⚠️ Skipping Snowflake tests (SNOWFLAKE_HOST and SNOWFLAKE_TOKEN required)"
fi

# Venice: requires VENICE_API_KEY
if [ -n "$VENICE_API_KEY" ]; then
echo "✓ Including Venice tests"
PROVIDERS+=("venice -> llama-3.3-70b")
else
echo "⚠️ Skipping Venice tests (VENICE_API_KEY required)"
fi

# LiteLLM: requires LITELLM_API_KEY (and optionally LITELLM_HOST)
if [ -n "$LITELLM_API_KEY" ]; then
echo "✓ Including LiteLLM tests"
PROVIDERS+=("litellm -> gpt-4o-mini")
else
echo "⚠️ Skipping LiteLLM tests (LITELLM_API_KEY required)"
fi

# Ollama: requires OLLAMA_HOST (or uses default localhost:11434)
if [ -n "$OLLAMA_HOST" ] || command -v ollama &> /dev/null; then
echo "✓ Including Ollama tests"
PROVIDERS+=("ollama -> qwen3")
else
echo "⚠️ Skipping Ollama tests (OLLAMA_HOST required or ollama must be installed)"
fi

# SageMaker TGI: requires AWS credentials and SAGEMAKER_ENDPOINT_NAME
if [ -n "$SAGEMAKER_ENDPOINT_NAME" ] && [ -n "$AWS_REGION" ]; then
echo "✓ Including SageMaker TGI tests"
PROVIDERS+=("sagemaker_tgi -> sagemaker-tgi-endpoint")
else
echo "⚠️ Skipping SageMaker TGI tests (SAGEMAKER_ENDPOINT_NAME and AWS_REGION required)"
fi

# GitHub Copilot: requires OAuth setup (check for cached token)
if [ -n "$GITHUB_COPILOT_TOKEN" ] || [ -f "$HOME/.config/goose/github_copilot_token.json" ]; then
echo "✓ Including GitHub Copilot tests"
PROVIDERS+=("github_copilot -> gpt-4.1")
else
echo "⚠️ Skipping GitHub Copilot tests (OAuth setup required - run 'goose configure' first)"
fi

# ChatGPT Codex: requires OAuth setup
if [ -n "$CHATGPT_CODEX_TOKEN" ] || [ -f "$HOME/.config/goose/chatgpt_codex_token.json" ]; then
echo "✓ Including ChatGPT Codex tests"
PROVIDERS+=("chatgpt_codex -> gpt-5.1-codex")
else
echo "⚠️ Skipping ChatGPT Codex tests (OAuth setup required - run 'goose configure' first)"
fi

# CLI-based providers (require the CLI tool to be installed)

# Claude Code CLI: requires 'claude' CLI tool
if command -v claude &> /dev/null; then
echo "✓ Including Claude Code CLI tests"
PROVIDERS+=("claude-code -> claude-sonnet-4-20250514")
else
echo "⚠️ Skipping Claude Code CLI tests ('claude' CLI tool required)"
fi

# Codex CLI: requires 'codex' CLI tool
if command -v codex &> /dev/null; then
echo "✓ Including Codex CLI tests"
PROVIDERS+=("codex -> gpt-5.2-codex")
else
echo "⚠️ Skipping Codex CLI tests ('codex' CLI tool required)"
fi

# Gemini CLI: requires 'gemini' CLI tool
if command -v gemini &> /dev/null; then
echo "✓ Including Gemini CLI tests"
PROVIDERS+=("gemini-cli -> gemini-2.5-pro")
else
echo "⚠️ Skipping Gemini CLI tests ('gemini' CLI tool required)"
fi

# Cursor Agent: requires 'cursor-agent' CLI tool
if command -v cursor-agent &> /dev/null; then
echo "✓ Including Cursor Agent tests"
PROVIDERS+=("cursor-agent -> auto")
else
echo "⚠️ Skipping Cursor Agent tests ('cursor-agent' CLI tool required)"
fi

echo ""

# Configure mode-specific settings
if [ "$CODE_EXEC_MODE" = true ]; then
echo "Mode: code_execution (JS batching)"
@@ -111,52 +224,147 @@ should_skip_provider() {
return 1
}

# Create temp directory for results
RESULTS_DIR=$(mktemp -d)
trap "rm -rf $RESULTS_DIR" EXIT

# Maximum parallel jobs (default: number of CPU cores, or override with MAX_PARALLEL)
MAX_PARALLEL=${MAX_PARALLEL:-$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 8)}
echo "Running tests with up to $MAX_PARALLEL parallel jobs"
echo ""

# Function to run a single test
run_test() {
local provider="$1"
local model="$2"
local result_file="$3"
local output_file="$4"

local testdir=$(mktemp -d)
echo "hello" > "$testdir/hello.txt"

# Run the test and capture output
(
export GOOSE_PROVIDER="$provider"
export GOOSE_MODEL="$model"
cd "$testdir" && "$SCRIPT_DIR/target/release/goose" run --text "Immediately use the shell tool to run 'ls'. Do not ask for confirmation." --with-builtin "$BUILTINS" 2>&1
) > "$output_file" 2>&1

# Check result
if grep -qE "$SUCCESS_PATTERN" "$output_file"; then
echo "success" > "$result_file"
else
echo "failure" > "$result_file"
fi

rm -rf "$testdir"
}

# Build list of all provider/model combinations
JOBS=()
job_index=0
for provider_config in "${PROVIDERS[@]}"; do
# Split on " -> " to get provider and models
PROVIDER="${provider_config%% -> *}"
MODELS_STR="${provider_config#* -> }"
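# e.g. "xai -> grok-3" yields PROVIDER="xai" and MODELS_STR="grok-3"
# (illustrative values: ${var%% -> *} keeps everything before the arrow,
# ${var#* -> } everything after it)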

# Skip provider if it's in SKIP_PROVIDERS
if should_skip_provider "$PROVIDER"; then
echo "⊘ Skipping provider: ${PROVIDER} (SKIP_PROVIDERS)"
echo "---"
continue
fi

# Split models on "|"
IFS='|' read -ra MODELS <<< "$MODELS_STR"
for MODEL in "${MODELS[@]}"; do
JOBS+=("$PROVIDER|$MODEL|$job_index")
((job_index++))
done
done

total_jobs=${#JOBS[@]}
echo "Starting $total_jobs tests..."
echo ""

# Run first test sequentially if any jobs exist
if [ $total_jobs -gt 0 ]; then
Collaborator (PR author):
Running one alone before the rest run concurrently is kind of silly, but it solves two small pain points. If you immediately start N concurrent sessions, then:

  • locally, you'll get the keychain password prompt N times, even if you click "always allow"
  • in CI, they all try to create the sqlite database at the same time, and that leads to things breaking. Maybe we should fix that, but it also seems highly unlikely to happen in the real world.

Collaborator:
It's also useful since, if you break something, it's most likely broken for all providers. This way the first test tells you that immediately.

Collaborator:
Yeah, the faster inner loop, I think, is reasonable.
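A minimal standalone sketch of the warm-up-then-fan-out pattern under discussion; run_one, JOBS, and MAX_PARALLEL here are illustrative placeholders, not the script's real definitions:

    run_one() { sleep 1; echo "done: $1"; }   # placeholder worker
    JOBS=(a b c d e)
    MAX_PARALLEL=3

    run_one "${JOBS[0]}"   # first job alone: primes keychain grant / sqlite db

    running=0
    for ((i = 1; i < ${#JOBS[@]}; i++)); do
        run_one "${JOBS[$i]}" &
        ((running++))
        if ((running >= MAX_PARALLEL)); then
            wait -n 2>/dev/null || wait   # free a slot once any job exits
            ((running--))
        fi
    done
    wait   # drain the rest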

echo "Running first test sequentially..."
first_job="${JOBS[0]}"
IFS='|' read -r provider model idx <<< "$first_job"

result_file="$RESULTS_DIR/result_$idx"
output_file="$RESULTS_DIR/output_$idx"
meta_file="$RESULTS_DIR/meta_$idx"
echo "$provider|$model" > "$meta_file"
Comment on lines +295 to +296

Copilot AI (Jan 30, 2026):
meta_file and the corresponding write to $RESULTS_DIR/meta_$idx are never read anywhere, so this extra file creation is dead code and can be removed to simplify the parallel runner loop.

Suggested change (delete these two lines):
    meta_file="$RESULTS_DIR/meta_$idx"
    echo "$provider|$model" > "$meta_file"
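Indeed, the collection loop later in the script re-derives provider and model by splitting the JOBS entry itself, so the meta files are never read. A tiny illustration of that round-trip, with made-up values:

    job="xai|grok-3|1"                        # format: provider|model|index
    IFS='|' read -r provider model idx <<< "$job"
    echo "$provider / $model -> result_$idx"  # prints: xai / grok-3 -> result_1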

# Run first test and wait for it to complete
run_test "$provider" "$model" "$result_file" "$output_file"
echo "First test completed."
echo ""
fi

# Run remaining tests in parallel
if [ $total_jobs -gt 1 ]; then
echo "Running remaining tests in parallel..."
running_jobs=0
for ((i=1; i<$total_jobs; i++)); do
job="${JOBS[$i]}"
IFS='|' read -r provider model idx <<< "$job"

result_file="$RESULTS_DIR/result_$idx"
output_file="$RESULTS_DIR/output_$idx"
meta_file="$RESULTS_DIR/meta_$idx"
echo "$provider|$model" > "$meta_file"

# Run test in background
run_test "$provider" "$model" "$result_file" "$output_file" &
((running_jobs++))

# Wait if we've hit the parallel limit
if [ $running_jobs -ge $MAX_PARALLEL ]; then
# 'wait -n' (bash 4.3+) returns as soon as any one job exits; older shells fall back to waiting for all
wait -n 2>/dev/null || wait
((running_jobs--))
fi
rm "$TMPFILE"
rm -rf "$TESTDIR"
echo "---"
done

# Wait for all remaining jobs
wait
fi

echo ""
echo "=== Test Results ==="
echo ""

# Collect results
RESULTS=()
HARD_FAILURES=()

for job in "${JOBS[@]}"; do
IFS='|' read -r provider model idx <<< "$job"

result_file="$RESULTS_DIR/result_$idx"
output_file="$RESULTS_DIR/output_$idx"

echo "Provider: $provider"
echo "Model: $model"
echo ""
cat "$output_file"
echo ""

if [ -f "$result_file" ] && [ "$(cat "$result_file")" = "success" ]; then
echo "✓ SUCCESS: Test passed - $SUCCESS_MSG"
RESULTS+=("✓ ${provider}: ${model}")
else
if is_allowed_failure "$provider" "$model"; then
echo "⚠ FLAKY: Test failed but model is in allowed failures list - $FAILURE_MSG"
RESULTS+=("⚠ ${provider}: ${model} (flaky)")
else
echo "✗ FAILED: Test failed - $FAILURE_MSG"
RESULTS+=("✗ ${provider}: ${model}")
HARD_FAILURES+=("${provider}: ${model}")
fi
fi
echo "---"
done

echo ""
echo "=== Test Summary ==="
for result in "${RESULTS[@]}"; do