From 162c73cd5eebf8e998ab96b12c1a2c8c85c6ea6a Mon Sep 17 00:00:00 2001 From: Lifei Zhou Date: Thu, 12 Feb 2026 15:36:19 +1100 Subject: [PATCH 1/5] refactored the test_providers scripts --- .github/workflows/pr-smoke-test.yml | 12 +- scripts/test_providers.sh | 445 ++++------------------------ scripts/test_providers_code_exec.sh | 53 ++++ scripts/test_providers_lib.sh | 245 +++++++++++++++ 4 files changed, 361 insertions(+), 394 deletions(-) create mode 100755 scripts/test_providers_code_exec.sh create mode 100755 scripts/test_providers_lib.sh diff --git a/.github/workflows/pr-smoke-test.yml b/.github/workflows/pr-smoke-test.yml index 399278f78ebb..f6111dbc61f3 100644 --- a/.github/workflows/pr-smoke-test.yml +++ b/.github/workflows/pr-smoke-test.yml @@ -181,21 +181,11 @@ jobs: - name: Make Binary Executable run: chmod +x target/debug/goose - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '22' - - - name: Install agentic providers - run: npm install -g @anthropic-ai/claude-code @openai/codex @google/gemini-cli - - name: Run Provider Tests (Code Execution Mode) env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }} GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - GEMINI_API_KEY: ${{ secrets.GOOGLE_API_KEY }} DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }} DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} @@ -208,7 +198,7 @@ jobs: run: | mkdir -p $HOME/.local/share/goose/sessions mkdir -p $HOME/.config/goose - bash scripts/test_providers.sh --code-exec + bash scripts/test_providers_code_exec.sh compaction-tests: name: Compaction Tests diff --git a/scripts/test_providers.sh b/scripts/test_providers.sh index b8a302c3e257..c01255435057 100755 --- a/scripts/test_providers.sh +++ b/scripts/test_providers.sh @@ -1,437 +1,116 @@ #!/bin/bash -# Test providers with optional code_execution mode -# Usage: -# ./test_providers.sh # Normal mode (direct tool calls) -# ./test_providers.sh --code-exec # Code execution mode (JS batching) +# Provider smoke tests - normal mode (direct tool calls) +# +# For each provider, asks goose to: +# 1. Run 'which ls' with empty PATH (tests PATH propagation via +# extend_path_with_shell, PR #7161) +# 2. Read a file via text_editor view (tests text_editor, PR #7167) +# Verifies the developer shell tool restores PATH from the user's shell. # # Environment variables: -# SKIP_PROVIDERS Comma-separated list of providers to skip (e.g., "tetrate,xai") +# SKIP_PROVIDERS Comma-separated list of providers to skip # SKIP_BUILD Skip the cargo build step if set -CODE_EXEC_MODE=false -for arg in "$@"; do - case $arg in - --code-exec) - CODE_EXEC_MODE=true - ;; - esac -done - -# Flaky models that are allowed to fail without failing the entire test run. -# These are typically preview/experimental models with inconsistent tool-calling behavior. -# Failures are still reported but don't block PRs. -ALLOWED_FAILURES=( - "google:gemini-2.5-flash" - "google:gemini-3-pro-preview" - "openrouter:nvidia/nemotron-3-nano-30b-a3b" - "openai:gpt-3.5-turbo" -) - -# Agentic providers handle tools internally and return text results. -# They can't produce the normal tool-call log patterns (e.g. "shell | developer"). -AGENTIC_PROVIDERS=("claude-code" "codex" "gemini-cli" "cursor-agent") +LIB_DIR="$(cd "$(dirname "$0")" && pwd)" +source "$LIB_DIR/test_providers_lib.sh" -if [ -f .env ]; then - export $(grep -v '^#' .env | xargs) -fi +echo "Mode: normal (direct tool calls)" +echo "" -if [ -z "$SKIP_BUILD" ]; then - echo "Building goose..." - cargo build --bin goose - echo "" -else - echo "Skipping build (SKIP_BUILD is set)..." - echo "" -fi +# --- Setup --- -SCRIPT_DIR=$(pwd) +GOOSE_BIN=$(build_goose) +BUILTINS="developer,autovisualiser,computercontroller,tutorial,todo,extensionmanager" -# Create a test file with known content in the current directory -# This cannot be /tmp as some agents cannot work outside the PWD +# Test content for agentic provider verification mkdir -p target TEST_CONTENT="test-content-abc123" TEST_FILE="./target/test-content.txt" echo "$TEST_CONTENT" > "$TEST_FILE" -# Format: "provider -> model1|model2|model3" -# Base providers that are always tested (with appropriate env vars) -PROVIDERS=( - "openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b" - "xai -> grok-3" - "openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5" - "anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-1-20250805" - "google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview" - "tetrate -> claude-sonnet-4-20250514" -) - -# Conditionally add providers based on environment variables - -# Databricks: requires DATABRICKS_HOST and DATABRICKS_TOKEN -if [ -n "$DATABRICKS_HOST" ] && [ -n "$DATABRICKS_TOKEN" ]; then - echo "✓ Including Databricks tests" - PROVIDERS+=("databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o") -else - echo "⚠️ Skipping Databricks tests (DATABRICKS_HOST and DATABRICKS_TOKEN required)" -fi - -# Azure OpenAI: requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME -if [ -n "$AZURE_OPENAI_ENDPOINT" ] && [ -n "$AZURE_OPENAI_DEPLOYMENT_NAME" ]; then - echo "✓ Including Azure OpenAI tests" - PROVIDERS+=("azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}") -else - echo "⚠️ Skipping Azure OpenAI tests (AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME required)" -fi - -# AWS Bedrock: requires AWS credentials (profile or keys) and AWS_REGION -if [ -n "$AWS_REGION" ] && { [ -n "$AWS_PROFILE" ] || [ -n "$AWS_ACCESS_KEY_ID" ]; }; then - echo "✓ Including AWS Bedrock tests" - PROVIDERS+=("aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0") -else - echo "⚠️ Skipping AWS Bedrock tests (AWS_REGION and AWS_PROFILE or AWS credentials required)" -fi - -# GCP Vertex AI: requires GCP_PROJECT_ID -if [ -n "$GCP_PROJECT_ID" ]; then - echo "✓ Including GCP Vertex AI tests" - PROVIDERS+=("gcp_vertex_ai -> gemini-2.5-pro") -else - echo "⚠️ Skipping GCP Vertex AI tests (GCP_PROJECT_ID required)" -fi - -# Snowflake: requires SNOWFLAKE_HOST and SNOWFLAKE_TOKEN -if [ -n "$SNOWFLAKE_HOST" ] && [ -n "$SNOWFLAKE_TOKEN" ]; then - echo "✓ Including Snowflake tests" - PROVIDERS+=("snowflake -> claude-sonnet-4-5") -else - echo "⚠️ Skipping Snowflake tests (SNOWFLAKE_HOST and SNOWFLAKE_TOKEN required)" -fi - -# Venice: requires VENICE_API_KEY -if [ -n "$VENICE_API_KEY" ]; then - echo "✓ Including Venice tests" - PROVIDERS+=("venice -> llama-3.3-70b") -else - echo "⚠️ Skipping Venice tests (VENICE_API_KEY required)" -fi - -# LiteLLM: requires LITELLM_API_KEY (and optionally LITELLM_HOST) -if [ -n "$LITELLM_API_KEY" ]; then - echo "✓ Including LiteLLM tests" - PROVIDERS+=("litellm -> gpt-4o-mini") -else - echo "⚠️ Skipping LiteLLM tests (LITELLM_API_KEY required)" -fi - -# Ollama: requires OLLAMA_HOST (or uses default localhost:11434) -if [ -n "$OLLAMA_HOST" ] || command -v ollama &> /dev/null; then - echo "✓ Including Ollama tests" - PROVIDERS+=("ollama -> qwen3") -else - echo "⚠️ Skipping Ollama tests (OLLAMA_HOST required or ollama must be installed)" -fi - -# SageMaker TGI: requires AWS credentials and SAGEMAKER_ENDPOINT_NAME -if [ -n "$SAGEMAKER_ENDPOINT_NAME" ] && [ -n "$AWS_REGION" ]; then - echo "✓ Including SageMaker TGI tests" - PROVIDERS+=("sagemaker_tgi -> sagemaker-tgi-endpoint") -else - echo "⚠️ Skipping SageMaker TGI tests (SAGEMAKER_ENDPOINT_NAME and AWS_REGION required)" -fi - -# GitHub Copilot: requires OAuth setup (check for cached token) -if [ -n "$GITHUB_COPILOT_TOKEN" ] || [ -f "$HOME/.config/goose/github_copilot_token.json" ]; then - echo "✓ Including GitHub Copilot tests" - PROVIDERS+=("github_copilot -> gpt-4.1") -else - echo "⚠️ Skipping GitHub Copilot tests (OAuth setup required - run 'goose configure' first)" -fi - -# ChatGPT Codex: requires OAuth setup -if [ -n "$CHATGPT_CODEX_TOKEN" ] || [ -f "$HOME/.config/goose/chatgpt_codex_token.json" ]; then - echo "✓ Including ChatGPT Codex tests" - PROVIDERS+=("chatgpt_codex -> gpt-5.1-codex") -else - echo "⚠️ Skipping ChatGPT Codex tests (OAuth setup required - run 'goose configure' first)" -fi - -# CLI-based providers (require the CLI tool to be installed) - -# Claude Code CLI: requires 'claude' CLI tool -if command -v claude &> /dev/null; then - echo "✓ Including Claude Code CLI tests" - PROVIDERS+=("claude-code -> claude-sonnet-4-20250514") -else - echo "⚠️ Skipping Claude Code CLI tests ('claude' CLI tool required)" -fi - -# Codex CLI: requires 'codex' CLI tool -if command -v codex &> /dev/null; then - echo "✓ Including Codex CLI tests" - PROVIDERS+=("codex -> gpt-5.2-codex") -else - echo "⚠️ Skipping Codex CLI tests ('codex' CLI tool required)" -fi - -# Gemini CLI: requires 'gemini' CLI tool -if command -v gemini &> /dev/null; then - echo "✓ Including Gemini CLI tests" - PROVIDERS+=("gemini-cli -> gemini-2.5-pro") -else - echo "⚠️ Skipping Gemini CLI tests ('gemini' CLI tool required)" -fi - -# Cursor Agent: requires 'cursor-agent' CLI tool -if command -v cursor-agent &> /dev/null; then - echo "✓ Including Cursor Agent tests" - PROVIDERS+=("cursor-agent -> auto") -else - echo "⚠️ Skipping Cursor Agent tests ('cursor-agent' CLI tool required)" -fi - -echo "" - -# Configure mode-specific settings -if [ "$CODE_EXEC_MODE" = true ]; then - echo "Mode: code_execution (JS batching)" - BUILTINS="developer,code_execution" - # Match code_execution tool usage: - # - "execute | code_execution" or "get_function_details | code_execution" (fallback format) - # - "tool call | execute" or "tool calls | execute" (new format with tool_graph) - SUCCESS_PATTERN="(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)" - SUCCESS_MSG="code_execution tool called" - FAILURE_MSG="no code_execution tools called" -else - echo "Mode: normal (direct tool calls)" - BUILTINS="developer,autovisualiser,computercontroller,tutorial,todo,extensionmanager" - SUCCESS_PATTERN="shell \| developer" - SUCCESS_MSG="developer tool called" - FAILURE_MSG="no developer tools called" -fi -echo "" - -is_allowed_failure() { - local provider="$1" - local model="$2" - local key="${provider}:${model}" - for allowed in "${ALLOWED_FAILURES[@]}"; do - if [ "$allowed" = "$key" ]; then - return 0 - fi - done - return 1 -} - -should_skip_provider() { - local provider="$1" - if [ -z "$SKIP_PROVIDERS" ]; then - return 1 - fi - IFS=',' read -ra SKIP_LIST <<< "$SKIP_PROVIDERS" - for skip in "${SKIP_LIST[@]}"; do - # Trim whitespace - skip=$(echo "$skip" | xargs) - if [ "$skip" = "$provider" ]; then - return 0 - fi - done - return 1 -} - -is_agentic_provider() { - local provider="$1" - for agentic in "${AGENTIC_PROVIDERS[@]}"; do - if [ "$agentic" = "$provider" ]; then - return 0 - fi - done - return 1 -} - -# Create temp directory for results -RESULTS_DIR=$(mktemp -d) -trap "rm -rf $RESULTS_DIR" EXIT - -# Maximum parallel jobs (default: number of CPU cores, or override with MAX_PARALLEL) -MAX_PARALLEL=${MAX_PARALLEL:-$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 8)} -echo "Running tests with up to $MAX_PARALLEL parallel jobs" -echo "" +# --- Test case --- -# Function to run a single test run_test() { - local provider="$1" - local model="$2" - local result_file="$3" - local output_file="$4" - + local provider="$1" model="$2" result_file="$3" output_file="$4" local testdir=$(mktemp -d) - # Agentic providers use a file-read prompt with known content marker; - # regular providers use the shell prompt that produces tool-call logs. local prompt if is_agentic_provider "$provider"; then cp "$TEST_FILE" "$testdir/test-content.txt" prompt="read ./test-content.txt and output its contents exactly" else - echo "hello" > "$testdir/hello.txt" - prompt="Immediately use the shell tool to run 'ls'. Do not ask for confirmation." + echo "$TEST_CONTENT" > "$testdir/hello.txt" + prompt="read the ./hello.txt" fi - # Run the test and capture output ( export GOOSE_PROVIDER="$provider" export GOOSE_MODEL="$model" - cd "$testdir" && "$SCRIPT_DIR/target/debug/goose" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 + export PATH="" + cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 ) > "$output_file" 2>&1 - # Check result: agentic providers return text containing the test content - # instead of producing tool-call log patterns + # Verify: agentic providers must echo test content, + # regular providers must have 'which ls' resolve to an actual path if is_agentic_provider "$provider"; then if grep -qi "$TEST_CONTENT" "$output_file"; then - echo "success" > "$result_file" + echo "success|test content echoed back" > "$result_file" else - echo "failure" > "$result_file" + echo "failure|test content not found in output" > "$result_file" fi - elif grep -qE "$SUCCESS_PATTERN" "$output_file"; then - echo "success" > "$result_file" else - echo "failure" > "$result_file" + if grep -qE "/bin/ls|/usr/bin/ls" "$output_file"; then + echo "success|PATH propagated, which ls resolved" > "$result_file" + else + echo "failure|PATH not propagated, which ls did not resolve" > "$result_file" + fi fi rm -rf "$testdir" } -# Build list of all provider/model combinations -JOBS=() -job_index=0 -for provider_config in "${PROVIDERS[@]}"; do - PROVIDER="${provider_config%% -> *}" - MODELS_STR="${provider_config#* -> }" - - # Skip provider if it's in SKIP_PROVIDERS - if should_skip_provider "$PROVIDER"; then - echo "⊘ Skipping provider: ${PROVIDER} (SKIP_PROVIDERS)" - continue - fi - - # Agentic providers don't use goose's code_execution system - if [ "$CODE_EXEC_MODE" = true ] && is_agentic_provider "$PROVIDER"; then - echo "⊘ Skipping agentic provider in code_exec mode: ${PROVIDER}" - continue - fi - - IFS='|' read -ra MODELS <<< "$MODELS_STR" - for MODEL in "${MODELS[@]}"; do - JOBS+=("$PROVIDER|$MODEL|$job_index") - ((job_index++)) - done -done - -total_jobs=${#JOBS[@]} -echo "Starting $total_jobs tests..." -echo "" - -# Run first test sequentially if any jobs exist -if [ $total_jobs -gt 0 ]; then - echo "Running first test sequentially..." - first_job="${JOBS[0]}" - IFS='|' read -r provider model idx <<< "$first_job" +# --- Developer PATH propagation test (PR #7161) --- +# Runs once with a single provider. Empty PATH verifies that +# extend_path_with_shell restores it from the user's shell. - result_file="$RESULTS_DIR/result_$idx" - output_file="$RESULTS_DIR/output_$idx" - meta_file="$RESULTS_DIR/meta_$idx" - echo "$provider|$model" > "$meta_file" +run_developer_path_test() { + local provider="anthropic" + local model="claude-sonnet-4-5-20250929" - # Run first test and wait for it to complete - run_test "$provider" "$model" "$result_file" "$output_file" - echo "First test completed." + echo "Provider: $provider Model: $model" echo "" -fi - -# Run remaining tests in parallel -if [ $total_jobs -gt 1 ]; then - echo "Running remaining tests in parallel..." - running_jobs=0 - for ((i=1; i<$total_jobs; i++)); do - job="${JOBS[$i]}" - IFS='|' read -r provider model idx <<< "$job" - - result_file="$RESULTS_DIR/result_$idx" - output_file="$RESULTS_DIR/output_$idx" - meta_file="$RESULTS_DIR/meta_$idx" - echo "$provider|$model" > "$meta_file" - - # Run test in background - run_test "$provider" "$model" "$result_file" "$output_file" & - ((running_jobs++)) - # Wait if we've hit the parallel limit - if [ $running_jobs -ge $MAX_PARALLEL ]; then - wait -n 2>/dev/null || wait - ((running_jobs--)) - fi - done - - # Wait for all remaining jobs - wait -fi - -echo "" -echo "=== Test Results ===" -echo "" - -# Collect results -RESULTS=() -HARD_FAILURES=() - -for job in "${JOBS[@]}"; do - IFS='|' read -r provider model idx <<< "$job" + local prompt="use the developer tool to run 'which ls'. don't attempt with other approaches if you don't find it" + local testdir=$(mktemp -d) + local output_file="$testdir/output.txt" - result_file="$RESULTS_DIR/result_$idx" - output_file="$RESULTS_DIR/output_$idx" + ( + export GOOSE_PROVIDER="$provider" + export GOOSE_MODEL="$model" + export PATH="" + cd "$testdir" && "$GOOSE_BIN" run \ + --text "$prompt" \ + --with-builtin developer 2>&1 + ) > "$output_file" 2>&1 - echo "Provider: $provider" - echo "Model: $model" - echo "" cat "$output_file" echo "" - if [ -f "$result_file" ] && [ "$(cat "$result_file")" = "success" ]; then - echo "✓ SUCCESS: Test passed - $SUCCESS_MSG" - RESULTS+=("✓ ${provider}: ${model}") + if grep -qE "/bin/ls|/usr/bin/ls|aliased to" "$output_file"; then + echo "✓ PASS: developer shell found 'ls' via PATH from user's shell" else - if is_allowed_failure "$provider" "$model"; then - echo "⚠ FLAKY: Test failed but model is in allowed failures list - $FAILURE_MSG" - RESULTS+=("⚠ ${provider}: ${model} (flaky)") - else - echo "✗ FAILED: Test failed - $FAILURE_MSG" - RESULTS+=("✗ ${provider}: ${model}") - HARD_FAILURES+=("${provider}: ${model}") - fi + echo "✗ FAIL: developer shell could not find 'ls' — PATH not restored from user's shell" + HARD_FAILURES+=("developer-tool-path-test") fi echo "---" -done + echo "" -echo "" -echo "=== Test Summary ===" -for result in "${RESULTS[@]}"; do - echo "$result" -done + rm -rf "$testdir" +} -if [ ${#HARD_FAILURES[@]} -gt 0 ]; then - echo "" - echo "Hard failures (${#HARD_FAILURES[@]}):" - for failure in "${HARD_FAILURES[@]}"; do - echo " - $failure" - done - echo "" - echo "Some tests failed!" - exit 1 -else - if echo "${RESULTS[@]}" | grep -q "⚠"; then - echo "" - echo "All required tests passed! (some flaky tests failed but are allowed)" - else - echo "" - echo "All tests passed!" - fi -fi +# --- Run --- + +run_developer_path_test +# build_test_cases +# run_test_cases run_test +# report_results diff --git a/scripts/test_providers_code_exec.sh b/scripts/test_providers_code_exec.sh new file mode 100755 index 000000000000..6952a38a95fe --- /dev/null +++ b/scripts/test_providers_code_exec.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Provider smoke tests - code execution mode (JS batching) +# +# For each provider, asks goose to run 'ls' via shell. +# Verifies the code_execution tool was invoked. +# Agentic providers are skipped (they don't use goose's code_execution system). +# +# Environment variables: +# SKIP_PROVIDERS Comma-separated list of providers to skip +# SKIP_BUILD Skip the cargo build step if set + +LIB_DIR="$(cd "$(dirname "$0")" && pwd)" +source "$LIB_DIR/test_providers_lib.sh" + +echo "Mode: code_execution (JS batching)" +echo "" + +# --- Setup --- + +GOOSE_BIN=$(build_goose) +BUILTINS="developer,code_execution" + +# --- Test case --- + +run_test() { + local provider="$1" model="$2" result_file="$3" output_file="$4" + local testdir=$(mktemp -d) + + echo "hello" > "$testdir/hello.txt" + local prompt="Run 'ls' to list files in the current directory." + + # Run goose + ( + export GOOSE_PROVIDER="$provider" + export GOOSE_MODEL="$model" + cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 + ) > "$output_file" 2>&1 + + # Verify: code_execution tool must be called + # Matches: "execute | code_execution", "get_function_details | code_execution", + # "tool call | execute", "tool calls | execute" + if grep -qE "(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)" "$output_file"; then + echo "success|code_execution tool called" > "$result_file" + else + echo "failure|no code_execution tool calls found" > "$result_file" + fi + + rm -rf "$testdir" +} + +build_test_cases --skip-agentic +run_test_cases run_test +report_results diff --git a/scripts/test_providers_lib.sh b/scripts/test_providers_lib.sh new file mode 100755 index 000000000000..b479a34f5541 --- /dev/null +++ b/scripts/test_providers_lib.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +PROVIDER_CONFIG=' +openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b +xai -> grok-3 +openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5 +anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-1-20250805 +google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview +tetrate -> claude-sonnet-4-20250514 +databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o +azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME} +aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0 +gcp_vertex_ai -> gemini-2.5-pro +snowflake -> claude-sonnet-4-5 +venice -> llama-3.3-70b +litellm -> gpt-4o-mini +sagemaker_tgi -> sagemaker-tgi-endpoint +github_copilot -> gpt-4.1 +chatgpt_codex -> gpt-5.1-codex +claude-code -> claude-sonnet-4-20250514 +codex -> gpt-5.2-codex +gemini-cli -> gemini-2.5-pro +cursor-agent -> auto +ollama -> qwen3 +' + +# Flaky models allowed to fail without blocking PRs. +ALLOWED_FAILURES=( + "google:gemini-2.5-flash" + "google:gemini-3-pro-preview" + "openrouter:nvidia/nemotron-3-nano-30b-a3b" + "openai:gpt-3.5-turbo" +) + +AGENTIC_PROVIDERS=("claude-code" "codex" "gemini-cli" "cursor-agent") + +if [ -f .env ]; then + export $(grep -v '^#' .env | xargs) +fi + +build_goose() { + if [ -z "$SKIP_BUILD" ]; then + echo "Building goose..." >&2 + cargo build --bin goose >&2 + echo "" >&2 + else + echo "Skipping build (SKIP_BUILD is set)..." >&2 + echo "" >&2 + fi + + echo "$(pwd)/target/debug/goose" +} + +has_env() { [ -n "${!1}" ]; } +has_cmd() { command -v "$1" &>/dev/null; } +has_file() { [ -f "$1" ]; } + +is_provider_available() { + case "$1" in + openrouter) has_env OPENROUTER_API_KEY ;; + xai) has_env XAI_API_KEY ;; + openai) has_env OPENAI_API_KEY ;; + anthropic) has_env ANTHROPIC_API_KEY ;; + google) has_env GOOGLE_API_KEY ;; + tetrate) has_env TETRATE_API_KEY ;; + databricks) has_env DATABRICKS_HOST && has_env DATABRICKS_TOKEN ;; + azure_openai) has_env AZURE_OPENAI_ENDPOINT && has_env AZURE_OPENAI_DEPLOYMENT_NAME ;; + aws_bedrock) has_env AWS_REGION && { has_env AWS_PROFILE || has_env AWS_ACCESS_KEY_ID; } ;; + gcp_vertex_ai) has_env GCP_PROJECT_ID ;; + snowflake) has_env SNOWFLAKE_HOST && has_env SNOWFLAKE_TOKEN ;; + venice) has_env VENICE_API_KEY ;; + litellm) has_env LITELLM_API_KEY ;; + sagemaker_tgi) has_env SAGEMAKER_ENDPOINT_NAME && has_env AWS_REGION ;; + github_copilot) has_env GITHUB_COPILOT_TOKEN || has_file "$HOME/.config/goose/github_copilot_token.json" ;; + chatgpt_codex) has_env CHATGPT_CODEX_TOKEN || has_file "$HOME/.config/goose/chatgpt_codex_token.json" ;; + ollama) has_env OLLAMA_HOST || has_cmd ollama ;; + claude-code) has_cmd claude ;; + codex) has_cmd codex ;; + gemini-cli) has_cmd gemini ;; + cursor-agent) has_cmd cursor-agent ;; + *) return 0 ;; + esac +} + +is_allowed_failure() { + local key="${1}:${2}" + for allowed in "${ALLOWED_FAILURES[@]}"; do + [ "$allowed" = "$key" ] && return 0 + done + return 1 +} + +should_skip_provider() { + [ -z "$SKIP_PROVIDERS" ] && return 1 + IFS=',' read -ra SKIP_LIST <<< "$SKIP_PROVIDERS" + for skip in "${SKIP_LIST[@]}"; do + skip=$(echo "$skip" | xargs) + [ "$skip" = "$1" ] && return 0 + done + return 1 +} + +is_agentic_provider() { + for agentic in "${AGENTIC_PROVIDERS[@]}"; do + [ "$agentic" = "$1" ] && return 0 + done + return 1 +} + +# build_test_cases [--skip-agentic] +build_test_cases() { + local skip_agentic=false + [ "$1" = "--skip-agentic" ] && skip_agentic=true + + local providers=() + while IFS= read -r line; do + [[ "$line" =~ ^#.*$ || -z "$line" ]] && continue + local provider="${line%% -> *}" + if is_provider_available "$provider"; then + providers+=("$(eval echo "\"$line\"")") + echo "✓ Including $provider" + else + echo "⚠️ Skipping $provider (prerequisites not met)" + fi + done <<< "$PROVIDER_CONFIG" + echo "" + + TEST_CASES=() + local job_index=0 + for provider_config in "${providers[@]}"; do + local provider="${provider_config%% -> *}" + local models_str="${provider_config#* -> }" + + if should_skip_provider "$provider"; then + echo "⊘ Skipping provider: ${provider} (SKIP_PROVIDERS)" + continue + fi + + if [ "$skip_agentic" = true ] && is_agentic_provider "$provider"; then + echo "⊘ Skipping agentic provider: ${provider}" + continue + fi + + IFS='|' read -ra models <<< "$models_str" + for model in "${models[@]}"; do + TEST_CASES+=("$provider|$model|$job_index") + ((job_index++)) + done + done +} + +# run_test_cases +run_test_cases() { + local test_fn="$1" + + RESULTS_DIR=$(mktemp -d) + trap "rm -rf $RESULTS_DIR ${CLEANUP_DIR:-}" EXIT + MAX_PARALLEL=${MAX_PARALLEL:-$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 8)} + echo "Running ${#TEST_CASES[@]} tests (max $MAX_PARALLEL parallel)" + echo "" + + local running=0 + for ((i=0; i<${#TEST_CASES[@]}; i++)); do + IFS='|' read -r provider model idx <<< "${TEST_CASES[$i]}" + + if [ $i -eq 0 ]; then + # First test runs sequentially to catch early failures + "$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx" + else + "$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx" & + ((running++)) + if [ $running -ge $MAX_PARALLEL ]; then + wait -n 2>/dev/null || wait + ((running--)) + fi + fi + done + wait +} + +# report_results +# Result files use format: "success|message" or "failure|message" +report_results() { + echo "" + echo "=== Test Results ===" + echo "" + + RESULTS=() + HARD_FAILURES=() + + for job in "${TEST_CASES[@]}"; do + IFS='|' read -r provider model idx <<< "$job" + + echo "Provider: $provider" + echo "Model: $model" + echo "" + cat "$RESULTS_DIR/output_$idx" + echo "" + + local result_line="" + [ -f "$RESULTS_DIR/result_$idx" ] && result_line=$(cat "$RESULTS_DIR/result_$idx") + local status="${result_line%%|*}" + local msg="${result_line#*|}" + + if [ "$status" = "success" ]; then + echo "✓ SUCCESS: $msg" + RESULTS+=("✓ ${provider}: ${model}") + else + if is_allowed_failure "$provider" "$model"; then + echo "⚠ FLAKY: $msg" + RESULTS+=("⚠ ${provider}: ${model} (flaky)") + else + echo "✗ FAILED: $msg" + RESULTS+=("✗ ${provider}: ${model}") + HARD_FAILURES+=("${provider}: ${model}") + fi + fi + echo "---" + done + + echo "" + echo "=== Test Summary ===" + for result in "${RESULTS[@]}"; do + echo "$result" + done + + if [ ${#HARD_FAILURES[@]} -gt 0 ]; then + echo "" + echo "Hard failures (${#HARD_FAILURES[@]}):" + for failure in "${HARD_FAILURES[@]}"; do + echo " - $failure" + done + echo "" + echo "Some tests failed!" + exit 1 + else + if echo "${RESULTS[@]}" | grep -q "⚠"; then + echo "" + echo "All required tests passed! (some flaky tests failed but are allowed)" + else + echo "" + echo "All tests passed!" + fi + fi +} From 520c53adefd835a7be39e52d57b26bec8e9e4116 Mon Sep 17 00:00:00 2001 From: Lifei Zhou Date: Thu, 12 Feb 2026 17:24:45 +1100 Subject: [PATCH 2/5] added test for develop editor view --- scripts/test_providers.sh | 79 ++++------------------------- scripts/test_providers_code_exec.sh | 8 --- 2 files changed, 10 insertions(+), 77 deletions(-) diff --git a/scripts/test_providers.sh b/scripts/test_providers.sh index c01255435057..e0e8e837f17c 100755 --- a/scripts/test_providers.sh +++ b/scripts/test_providers.sh @@ -1,15 +1,4 @@ #!/bin/bash -# Provider smoke tests - normal mode (direct tool calls) -# -# For each provider, asks goose to: -# 1. Run 'which ls' with empty PATH (tests PATH propagation via -# extend_path_with_shell, PR #7161) -# 2. Read a file via text_editor view (tests text_editor, PR #7167) -# Verifies the developer shell tool restores PATH from the user's shell. -# -# Environment variables: -# SKIP_PROVIDERS Comma-separated list of providers to skip -# SKIP_BUILD Skip the cargo build step if set LIB_DIR="$(cd "$(dirname "$0")" && pwd)" source "$LIB_DIR/test_providers_lib.sh" @@ -18,18 +7,14 @@ echo "Mode: normal (direct tool calls)" echo "" # --- Setup --- - GOOSE_BIN=$(build_goose) -BUILTINS="developer,autovisualiser,computercontroller,tutorial,todo,extensionmanager" +BUILTINS="developer" -# Test content for agentic provider verification mkdir -p target TEST_CONTENT="test-content-abc123" TEST_FILE="./target/test-content.txt" echo "$TEST_CONTENT" > "$TEST_FILE" -# --- Test case --- - run_test() { local provider="$1" model="$2" result_file="$3" output_file="$4" local testdir=$(mktemp -d) @@ -40,7 +25,7 @@ run_test() { prompt="read ./test-content.txt and output its contents exactly" else echo "$TEST_CONTENT" > "$testdir/hello.txt" - prompt="read the ./hello.txt" + prompt="Use the text_editor view command to read ./hello.txt, then output its contents in UPPERCASE don't use any other tool in Developer" fi ( @@ -50,67 +35,23 @@ run_test() { cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 ) > "$output_file" 2>&1 - # Verify: agentic providers must echo test content, - # regular providers must have 'which ls' resolve to an actual path if is_agentic_provider "$provider"; then if grep -qi "$TEST_CONTENT" "$output_file"; then - echo "success|test content echoed back" > "$result_file" + echo "success|test content found by model" > "$result_file" else - echo "failure|test content not found in output" > "$result_file" + echo "failure|test content not found by model" > "$result_file" fi else - if grep -qE "/bin/ls|/usr/bin/ls" "$output_file"; then - echo "success|PATH propagated, which ls resolved" > "$result_file" + if grep -q "TEST-CONTENT-ABC123" "$output_file"; then + echo "success|model read and uppercased file content" > "$result_file" else - echo "failure|PATH not propagated, which ls did not resolve" > "$result_file" + echo "failure|model did not return uppercased file content" > "$result_file" fi fi rm -rf "$testdir" } -# --- Developer PATH propagation test (PR #7161) --- -# Runs once with a single provider. Empty PATH verifies that -# extend_path_with_shell restores it from the user's shell. - -run_developer_path_test() { - local provider="anthropic" - local model="claude-sonnet-4-5-20250929" - - echo "Provider: $provider Model: $model" - echo "" - - local prompt="use the developer tool to run 'which ls'. don't attempt with other approaches if you don't find it" - local testdir=$(mktemp -d) - local output_file="$testdir/output.txt" - - ( - export GOOSE_PROVIDER="$provider" - export GOOSE_MODEL="$model" - export PATH="" - cd "$testdir" && "$GOOSE_BIN" run \ - --text "$prompt" \ - --with-builtin developer 2>&1 - ) > "$output_file" 2>&1 - - cat "$output_file" - echo "" - - if grep -qE "/bin/ls|/usr/bin/ls|aliased to" "$output_file"; then - echo "✓ PASS: developer shell found 'ls' via PATH from user's shell" - else - echo "✗ FAIL: developer shell could not find 'ls' — PATH not restored from user's shell" - HARD_FAILURES+=("developer-tool-path-test") - fi - echo "---" - echo "" - - rm -rf "$testdir" -} - -# --- Run --- - -run_developer_path_test -# build_test_cases -# run_test_cases run_test -# report_results +build_test_cases +run_test_cases run_test +report_results diff --git a/scripts/test_providers_code_exec.sh b/scripts/test_providers_code_exec.sh index 6952a38a95fe..d0737c37cef6 100755 --- a/scripts/test_providers_code_exec.sh +++ b/scripts/test_providers_code_exec.sh @@ -1,13 +1,5 @@ #!/bin/bash # Provider smoke tests - code execution mode (JS batching) -# -# For each provider, asks goose to run 'ls' via shell. -# Verifies the code_execution tool was invoked. -# Agentic providers are skipped (they don't use goose's code_execution system). -# -# Environment variables: -# SKIP_PROVIDERS Comma-separated list of providers to skip -# SKIP_BUILD Skip the cargo build step if set LIB_DIR="$(cd "$(dirname "$0")" && pwd)" source "$LIB_DIR/test_providers_lib.sh" From 0a1a58e3f130ffef55046b93eb95b5342fa40187 Mon Sep 17 00:00:00 2001 From: Lifei Zhou Date: Thu, 12 Feb 2026 19:11:36 +1100 Subject: [PATCH 3/5] fixed the test --- scripts/test_providers.sh | 1 - scripts/test_providers_lib.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/test_providers.sh b/scripts/test_providers.sh index e0e8e837f17c..076e2e44f25c 100755 --- a/scripts/test_providers.sh +++ b/scripts/test_providers.sh @@ -31,7 +31,6 @@ run_test() { ( export GOOSE_PROVIDER="$provider" export GOOSE_MODEL="$model" - export PATH="" cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 ) > "$output_file" 2>&1 diff --git a/scripts/test_providers_lib.sh b/scripts/test_providers_lib.sh index b479a34f5541..8c4dad58dc08 100755 --- a/scripts/test_providers_lib.sh +++ b/scripts/test_providers_lib.sh @@ -4,7 +4,7 @@ PROVIDER_CONFIG=' openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b xai -> grok-3 openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5 -anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-1-20250805 +anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-5-20251101 google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview tetrate -> claude-sonnet-4-20250514 databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o From 95125c81dfa4b0cae946c86a5e2de1c9038a7d4b Mon Sep 17 00:00:00 2001 From: Lifei Zhou Date: Thu, 12 Feb 2026 19:37:16 +1100 Subject: [PATCH 4/5] tune the prompt and clean up --- scripts/test_providers.sh | 13 +++++++------ scripts/test_providers_lib.sh | 8 ++++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/test_providers.sh b/scripts/test_providers.sh index 076e2e44f25c..203cef6bf992 100755 --- a/scripts/test_providers.sh +++ b/scripts/test_providers.sh @@ -6,7 +6,6 @@ source "$LIB_DIR/test_providers_lib.sh" echo "Mode: normal (direct tool calls)" echo "" -# --- Setup --- GOOSE_BIN=$(build_goose) BUILTINS="developer" @@ -24,8 +23,8 @@ run_test() { cp "$TEST_FILE" "$testdir/test-content.txt" prompt="read ./test-content.txt and output its contents exactly" else - echo "$TEST_CONTENT" > "$testdir/hello.txt" - prompt="Use the text_editor view command to read ./hello.txt, then output its contents in UPPERCASE don't use any other tool in Developer" + echo "$TEST_CONTENT" > "$testdir/input.txt" + prompt="Use the text_editor view command to read ./input.txt, then output this file's contents in UPPERCASE. Do NOT use any other tool in Developer" fi ( @@ -41,10 +40,12 @@ run_test() { echo "failure|test content not found by model" > "$result_file" fi else - if grep -q "TEST-CONTENT-ABC123" "$output_file"; then - echo "success|model read and uppercased file content" > "$result_file" - else + if ! grep -q "text_editor | developer" "$output_file"; then + echo "failure|model did not use text_editor tool" > "$result_file" + elif ! grep -q "TEST-CONTENT-ABC123" "$output_file"; then echo "failure|model did not return uppercased file content" > "$result_file" + else + echo "success|model read and uppercased file content" > "$result_file" fi fi diff --git a/scripts/test_providers_lib.sh b/scripts/test_providers_lib.sh index 8c4dad58dc08..0912e64aca66 100755 --- a/scripts/test_providers_lib.sh +++ b/scripts/test_providers_lib.sh @@ -1,6 +1,6 @@ #!/bin/bash -PROVIDER_CONFIG=' +PROVIDER_CONFIG=" openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b xai -> grok-3 openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5 @@ -22,7 +22,7 @@ codex -> gpt-5.2-codex gemini-cli -> gemini-2.5-pro cursor-agent -> auto ollama -> qwen3 -' +" # Flaky models allowed to fail without blocking PRs. ALLOWED_FAILURES=( @@ -117,7 +117,7 @@ build_test_cases() { [[ "$line" =~ ^#.*$ || -z "$line" ]] && continue local provider="${line%% -> *}" if is_provider_available "$provider"; then - providers+=("$(eval echo "\"$line\"")") + providers+=("$line") echo "✓ Including $provider" else echo "⚠️ Skipping $provider (prerequisites not met)" @@ -154,7 +154,7 @@ run_test_cases() { local test_fn="$1" RESULTS_DIR=$(mktemp -d) - trap "rm -rf $RESULTS_DIR ${CLEANUP_DIR:-}" EXIT + trap 'if [ -n "${RESULTS_DIR:-}" ]; then rm -rf -- "$RESULTS_DIR"; fi; if [ -n "${CLEANUP_DIR:-}" ]; then rm -rf -- "$CLEANUP_DIR"; fi' EXIT MAX_PARALLEL=${MAX_PARALLEL:-$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 8)} echo "Running ${#TEST_CASES[@]} tests (max $MAX_PARALLEL parallel)" echo "" From f1baebdb2ae0531e0eb2901a98af52b00636b11f Mon Sep 17 00:00:00 2001 From: Lifei Zhou Date: Thu, 12 Feb 2026 19:38:08 +1100 Subject: [PATCH 5/5] more cleanup --- scripts/test_providers_lib.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/test_providers_lib.sh b/scripts/test_providers_lib.sh index 0912e64aca66..b56f13a89982 100755 --- a/scripts/test_providers_lib.sh +++ b/scripts/test_providers_lib.sh @@ -178,8 +178,6 @@ run_test_cases() { wait } -# report_results -# Result files use format: "success|message" or "failure|message" report_results() { echo "" echo "=== Test Results ==="