diff --git a/scripts/test_providers.sh b/scripts/test_providers.sh
index a0fdc41a2204..60bc4c3fb8fa 100755
--- a/scripts/test_providers.sh
+++ b/scripts/test_providers.sh
@@ -41,6 +41,7 @@ fi
 SCRIPT_DIR=$(pwd)
 
 # Format: "provider -> model1|model2|model3"
+# Base providers that are always tested (with appropriate env vars)
 PROVIDERS=(
     "openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b"
     "xai -> grok-3"
@@ -50,20 +51,132 @@ PROVIDERS=(
     "tetrate -> claude-sonnet-4-20250514"
 )
 
-# In CI, only run Databricks tests if DATABRICKS_HOST and DATABRICKS_TOKEN are set
-# Locally, always run Databricks tests
-if [ -n "$CI" ]; then
-  if [ -n "$DATABRICKS_HOST" ] && [ -n "$DATABRICKS_TOKEN" ]; then
-    echo "✓ Including Databricks tests"
-    PROVIDERS+=("databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o")
-  else
-    echo "⚠️ Skipping Databricks tests (DATABRICKS_HOST and DATABRICKS_TOKEN required in CI)"
-  fi
-else
+# Conditionally add providers based on environment variables
+
+# Databricks: requires DATABRICKS_HOST and DATABRICKS_TOKEN
+if [ -n "$DATABRICKS_HOST" ] && [ -n "$DATABRICKS_TOKEN" ]; then
   echo "✓ Including Databricks tests"
   PROVIDERS+=("databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o")
+else
+  echo "⚠️ Skipping Databricks tests (DATABRICKS_HOST and DATABRICKS_TOKEN required)"
+fi
+
+# Azure OpenAI: requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME
+if [ -n "$AZURE_OPENAI_ENDPOINT" ] && [ -n "$AZURE_OPENAI_DEPLOYMENT_NAME" ]; then
+  echo "✓ Including Azure OpenAI tests"
+  PROVIDERS+=("azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}")
+else
+  echo "⚠️ Skipping Azure OpenAI tests (AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME required)"
+fi
+
+# AWS Bedrock: requires AWS credentials (profile or keys) and AWS_REGION
+if [ -n "$AWS_REGION" ] && { [ -n "$AWS_PROFILE" ] || [ -n "$AWS_ACCESS_KEY_ID" ]; }; then
+  echo "✓ Including AWS Bedrock tests"
+  PROVIDERS+=("aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0")
+else
+  echo "⚠️ Skipping AWS Bedrock tests (AWS_REGION and AWS_PROFILE or AWS credentials required)"
+fi
+
+# GCP Vertex AI: requires GCP_PROJECT_ID
+if [ -n "$GCP_PROJECT_ID" ]; then
+  echo "✓ Including GCP Vertex AI tests"
+  PROVIDERS+=("gcp_vertex_ai -> gemini-2.5-pro")
+else
+  echo "⚠️ Skipping GCP Vertex AI tests (GCP_PROJECT_ID required)"
+fi
+
+# Snowflake: requires SNOWFLAKE_HOST and SNOWFLAKE_TOKEN
+if [ -n "$SNOWFLAKE_HOST" ] && [ -n "$SNOWFLAKE_TOKEN" ]; then
+  echo "✓ Including Snowflake tests"
+  PROVIDERS+=("snowflake -> claude-sonnet-4-5")
+else
+  echo "⚠️ Skipping Snowflake tests (SNOWFLAKE_HOST and SNOWFLAKE_TOKEN required)"
+fi
+
+# Venice: requires VENICE_API_KEY
+if [ -n "$VENICE_API_KEY" ]; then
+  echo "✓ Including Venice tests"
+  PROVIDERS+=("venice -> llama-3.3-70b")
+else
+  echo "⚠️ Skipping Venice tests (VENICE_API_KEY required)"
+fi
+
+# LiteLLM: requires LITELLM_API_KEY (and optionally LITELLM_HOST)
+if [ -n "$LITELLM_API_KEY" ]; then
+  echo "✓ Including LiteLLM tests"
+  PROVIDERS+=("litellm -> gpt-4o-mini")
+else
+  echo "⚠️ Skipping LiteLLM tests (LITELLM_API_KEY required)"
+fi
+
+# Ollama: requires OLLAMA_HOST or a local ollama install (defaults to localhost:11434)
+if [ -n "$OLLAMA_HOST" ] || command -v ollama &> /dev/null; then
+  echo "✓ Including Ollama tests"
+  PROVIDERS+=("ollama -> qwen3")
+else
+  echo "⚠️ Skipping Ollama tests (OLLAMA_HOST required or ollama must be installed)"
+fi
+
+# SageMaker TGI: requires SAGEMAKER_ENDPOINT_NAME and AWS_REGION (plus AWS credentials)
+if [ -n "$SAGEMAKER_ENDPOINT_NAME" ] && [ -n "$AWS_REGION" ]; then
+  echo "✓ Including SageMaker TGI tests"
+  PROVIDERS+=("sagemaker_tgi -> sagemaker-tgi-endpoint")
+else
+  echo "⚠️ Skipping SageMaker TGI tests (SAGEMAKER_ENDPOINT_NAME and AWS_REGION required)"
+fi
+
+# GitHub Copilot: requires OAuth setup (check for cached token)
+if [ -n "$GITHUB_COPILOT_TOKEN" ] || [ -f "$HOME/.config/goose/github_copilot_token.json" ]; then
+  echo "✓ Including GitHub Copilot tests"
+  PROVIDERS+=("github_copilot -> gpt-4.1")
+else
+  echo "⚠️ Skipping GitHub Copilot tests (OAuth setup required - run 'goose configure' first)"
 fi
+
+# ChatGPT Codex: requires OAuth setup
+if [ -n "$CHATGPT_CODEX_TOKEN" ] || [ -f "$HOME/.config/goose/chatgpt_codex_token.json" ]; then
+  echo "✓ Including ChatGPT Codex tests"
+  PROVIDERS+=("chatgpt_codex -> gpt-5.1-codex")
+else
+  echo "⚠️ Skipping ChatGPT Codex tests (OAuth setup required - run 'goose configure' first)"
+fi
+
+# CLI-based providers (require the CLI tool to be installed)
+
+# Claude Code CLI: requires 'claude' CLI tool
+if command -v claude &> /dev/null; then
+  echo "✓ Including Claude Code CLI tests"
+  PROVIDERS+=("claude-code -> claude-sonnet-4-20250514")
+else
+  echo "⚠️ Skipping Claude Code CLI tests ('claude' CLI tool required)"
+fi
+
+# Codex CLI: requires 'codex' CLI tool
+if command -v codex &> /dev/null; then
+  echo "✓ Including Codex CLI tests"
+  PROVIDERS+=("codex -> gpt-5.2-codex")
+else
+  echo "⚠️ Skipping Codex CLI tests ('codex' CLI tool required)"
+fi
+
+# Gemini CLI: requires 'gemini' CLI tool
+if command -v gemini &> /dev/null; then
+  echo "✓ Including Gemini CLI tests"
+  PROVIDERS+=("gemini-cli -> gemini-2.5-pro")
+else
+  echo "⚠️ Skipping Gemini CLI tests ('gemini' CLI tool required)"
+fi
+
+# Cursor Agent: requires 'cursor-agent' CLI tool
+if command -v cursor-agent &> /dev/null; then
+  echo "✓ Including Cursor Agent tests"
+  PROVIDERS+=("cursor-agent -> auto")
+else
+  echo "⚠️ Skipping Cursor Agent tests ('cursor-agent' CLI tool required)"
+fi
+
+echo ""
+
 # Configure mode-specific settings
 if [ "$CODE_EXEC_MODE" = true ]; then
   echo "Mode: code_execution (JS batching)"
@@ -111,52 +224,147 @@ should_skip_provider() {
   return 1
 }
 
-RESULTS=()
-HARD_FAILURES=()
+# Create temp directory for results
+RESULTS_DIR=$(mktemp -d)
+trap "rm -rf $RESULTS_DIR" EXIT
+
+# Maximum parallel jobs (default: number of CPU cores, or override with MAX_PARALLEL)
+MAX_PARALLEL=${MAX_PARALLEL:-$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 8)}
+echo "Running tests with up to $MAX_PARALLEL parallel jobs"
+echo ""
+
+# Function to run a single test
+run_test() {
+  local provider="$1"
+  local model="$2"
+  local result_file="$3"
+  local output_file="$4"
+  local testdir=$(mktemp -d)
+  echo "hello" > "$testdir/hello.txt"
+
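+  # The subshell below scopes GOOSE_PROVIDER/GOOSE_MODEL to this one test,
+  # so concurrent jobs cannot clobber each other's environment.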
+  # Run the test and capture output
+  (
+    export GOOSE_PROVIDER="$provider"
+    export GOOSE_MODEL="$model"
+    cd "$testdir" && "$SCRIPT_DIR/target/release/goose" run --text "Immediately use the shell tool to run 'ls'. Do not ask for confirmation." --with-builtin "$BUILTINS" 2>&1
+  ) > "$output_file" 2>&1
+
+  # Check result
+  if grep -qE "$SUCCESS_PATTERN" "$output_file"; then
+    echo "success" > "$result_file"
+  else
+    echo "failure" > "$result_file"
+  fi
+
+  rm -rf "$testdir"
+}
+
+# Build list of all provider/model combinations
+JOBS=()
+job_index=0
 for provider_config in "${PROVIDERS[@]}"; do
-  # Split on " -> " to get provider and models
   PROVIDER="${provider_config%% -> *}"
   MODELS_STR="${provider_config#* -> }"
 
   # Skip provider if it's in SKIP_PROVIDERS
   if should_skip_provider "$PROVIDER"; then
     echo "⊘ Skipping provider: ${PROVIDER} (SKIP_PROVIDERS)"
-    echo "---"
     continue
   fi
 
-  # Split models on "|"
   IFS='|' read -ra MODELS <<< "$MODELS_STR"
   for MODEL in "${MODELS[@]}"; do
-    export GOOSE_PROVIDER="$PROVIDER"
-    export GOOSE_MODEL="$MODEL"
-    TESTDIR=$(mktemp -d)
-    echo "hello" > "$TESTDIR/hello.txt"
-    echo "Provider: ${PROVIDER}"
-    echo "Model: ${MODEL}"
-    echo ""
-    TMPFILE=$(mktemp)
-    (cd "$TESTDIR" && "$SCRIPT_DIR/target/release/goose" run --text "Immediately use the shell tool to run 'ls'. Do not ask for confirmation." --with-builtin "$BUILTINS" 2>&1) | tee "$TMPFILE"
-    echo ""
-    if grep -qE "$SUCCESS_PATTERN" "$TMPFILE"; then
-      echo "✓ SUCCESS: Test passed - $SUCCESS_MSG"
-      RESULTS+=("✓ ${PROVIDER}: ${MODEL}")
-    else
-      if is_allowed_failure "$PROVIDER" "$MODEL"; then
-        echo "⚠ FLAKY: Test failed but model is in allowed failures list - $FAILURE_MSG"
-        RESULTS+=("⚠ ${PROVIDER}: ${MODEL} (flaky)")
-      else
-        echo "✗ FAILED: Test failed - $FAILURE_MSG"
-        RESULTS+=("✗ ${PROVIDER}: ${MODEL}")
-        HARD_FAILURES+=("${PROVIDER}: ${MODEL}")
-      fi
+    JOBS+=("$PROVIDER|$MODEL|$job_index")
+    ((job_index++))
+  done
+done
+
+total_jobs=${#JOBS[@]}
+echo "Starting $total_jobs tests..."
+echo ""
+
+# Run first test sequentially if any jobs exist
+if [ $total_jobs -gt 0 ]; then
+  echo "Running first test sequentially..."
+  first_job="${JOBS[0]}"
+  IFS='|' read -r provider model idx <<< "$first_job"
+
+  result_file="$RESULTS_DIR/result_$idx"
+  output_file="$RESULTS_DIR/output_$idx"
+  meta_file="$RESULTS_DIR/meta_$idx"
+  echo "$provider|$model" > "$meta_file"
+
+  # Run first test and wait for it to complete
+  run_test "$provider" "$model" "$result_file" "$output_file"
+  echo "First test completed."
+  echo ""
+fi
+
+# Run remaining tests in parallel
+if [ $total_jobs -gt 1 ]; then
+  echo "Running remaining tests in parallel..."
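+  # Sliding window: launch up to MAX_PARALLEL background jobs, then reap one
+  # with 'wait -n' (bash 4.3+) before starting the next; on older shells the
+  # fallback plain 'wait' drains the whole batch instead.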
+  running_jobs=0
+  for ((i=1; i<$total_jobs; i++)); do
+    job="${JOBS[$i]}"
+    IFS='|' read -r provider model idx <<< "$job"
+
+    result_file="$RESULTS_DIR/result_$idx"
+    output_file="$RESULTS_DIR/output_$idx"
+    meta_file="$RESULTS_DIR/meta_$idx"
+    echo "$provider|$model" > "$meta_file"
+
+    # Run test in background
+    run_test "$provider" "$model" "$result_file" "$output_file" &
+    ((running_jobs++))
+
+    # Wait if we've hit the parallel limit
+    if [ $running_jobs -ge $MAX_PARALLEL ]; then
+      wait -n 2>/dev/null || wait
+      ((running_jobs--))
     fi
-    rm "$TMPFILE"
-    rm -rf "$TESTDIR"
-    echo "---"
   done
+
+  # Wait for all remaining jobs
+  wait
+fi
+
+echo ""
+echo "=== Test Results ==="
+echo ""
+
+# Collect results
+RESULTS=()
+HARD_FAILURES=()
+
+for job in "${JOBS[@]}"; do
+  IFS='|' read -r provider model idx <<< "$job"
+
+  result_file="$RESULTS_DIR/result_$idx"
+  output_file="$RESULTS_DIR/output_$idx"
+
+  echo "Provider: $provider"
+  echo "Model: $model"
+  echo ""
+  cat "$output_file"
+  echo ""
+
+  if [ -f "$result_file" ] && [ "$(cat "$result_file")" = "success" ]; then
+    echo "✓ SUCCESS: Test passed - $SUCCESS_MSG"
+    RESULTS+=("✓ ${provider}: ${model}")
+  else
+    if is_allowed_failure "$provider" "$model"; then
+      echo "⚠ FLAKY: Test failed but model is in allowed failures list - $FAILURE_MSG"
+      RESULTS+=("⚠ ${provider}: ${model} (flaky)")
+    else
+      echo "✗ FAILED: Test failed - $FAILURE_MSG"
+      RESULTS+=("✗ ${provider}: ${model}")
+      HARD_FAILURES+=("${provider}: ${model}")
+    fi
+  fi
+  echo "---"
 done
+
 echo ""
 echo "=== Test Summary ==="
 for result in "${RESULTS[@]}"; do