diff --git a/.github/workflows/pr-smoke-test.yml b/.github/workflows/pr-smoke-test.yml
index 399278f78eb..f6111dbc61f 100644
--- a/.github/workflows/pr-smoke-test.yml
+++ b/.github/workflows/pr-smoke-test.yml
@@ -181,21 +181,11 @@ jobs:
       - name: Make Binary Executable
         run: chmod +x target/debug/goose
 
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-
-      - name: Install agentic providers
-        run: npm install -g @anthropic-ai/claude-code @openai/codex @google/gemini-cli
-
       - name: Run Provider Tests (Code Execution Mode)
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
-          GEMINI_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
           DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}
           DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
@@ -208,7 +198,7 @@ jobs:
         run: |
           mkdir -p $HOME/.local/share/goose/sessions
           mkdir -p $HOME/.config/goose
-          bash scripts/test_providers.sh --code-exec
+          bash scripts/test_providers_code_exec.sh
 
   compaction-tests:
     name: Compaction Tests
diff --git a/scripts/test_providers.sh b/scripts/test_providers.sh
index b8a302c3e25..203cef6bf99 100755
--- a/scripts/test_providers.sh
+++ b/scripts/test_providers.sh
@@ -1,437 +1,57 @@
 #!/bin/bash
-# Test providers with optional code_execution mode
-# Usage:
-#   ./test_providers.sh              # Normal mode (direct tool calls)
-#   ./test_providers.sh --code-exec  # Code execution mode (JS batching)
-#
-# Environment variables:
-#   SKIP_PROVIDERS   Comma-separated list of providers to skip (e.g., "tetrate,xai")
-#   SKIP_BUILD       Skip the cargo build step if set
-
-CODE_EXEC_MODE=false
-for arg in "$@"; do
-    case $arg in
-        --code-exec)
-            CODE_EXEC_MODE=true
-            ;;
-    esac
-done
+LIB_DIR="$(cd "$(dirname "$0")" && pwd)"
+source "$LIB_DIR/test_providers_lib.sh"
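+
+# Provider smoke tests - normal mode (direct tool calls)
+#
+# Usage:
+#   ./test_providers.sh
+#
+# Environment knobs (consumed here and in test_providers_lib.sh):
+#   SKIP_PROVIDERS  Comma-separated list of providers to skip (e.g. "tetrate,xai")
+#   SKIP_BUILD      Skip the cargo build step if set
+#   MAX_PARALLEL    Cap on concurrent test jobs (defaults to the CPU count)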
- echo "" -fi +echo "Mode: normal (direct tool calls)" +echo "" -SCRIPT_DIR=$(pwd) +GOOSE_BIN=$(build_goose) +BUILTINS="developer" -# Create a test file with known content in the current directory -# This cannot be /tmp as some agents cannot work outside the PWD mkdir -p target TEST_CONTENT="test-content-abc123" TEST_FILE="./target/test-content.txt" echo "$TEST_CONTENT" > "$TEST_FILE" -# Format: "provider -> model1|model2|model3" -# Base providers that are always tested (with appropriate env vars) -PROVIDERS=( - "openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b" - "xai -> grok-3" - "openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5" - "anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-1-20250805" - "google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview" - "tetrate -> claude-sonnet-4-20250514" -) - -# Conditionally add providers based on environment variables - -# Databricks: requires DATABRICKS_HOST and DATABRICKS_TOKEN -if [ -n "$DATABRICKS_HOST" ] && [ -n "$DATABRICKS_TOKEN" ]; then - echo "✓ Including Databricks tests" - PROVIDERS+=("databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o") -else - echo "⚠️ Skipping Databricks tests (DATABRICKS_HOST and DATABRICKS_TOKEN required)" -fi - -# Azure OpenAI: requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME -if [ -n "$AZURE_OPENAI_ENDPOINT" ] && [ -n "$AZURE_OPENAI_DEPLOYMENT_NAME" ]; then - echo "✓ Including Azure OpenAI tests" - PROVIDERS+=("azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}") -else - echo "⚠️ Skipping Azure OpenAI tests (AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME required)" -fi - -# AWS Bedrock: requires AWS credentials (profile or keys) and AWS_REGION -if [ -n "$AWS_REGION" ] && { [ -n "$AWS_PROFILE" ] || [ -n "$AWS_ACCESS_KEY_ID" ]; }; then - echo "✓ Including AWS Bedrock tests" - PROVIDERS+=("aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0") -else - echo "⚠️ Skipping AWS Bedrock tests (AWS_REGION and AWS_PROFILE or AWS credentials required)" -fi - -# GCP Vertex AI: requires GCP_PROJECT_ID -if [ -n "$GCP_PROJECT_ID" ]; then - echo "✓ Including GCP Vertex AI tests" - PROVIDERS+=("gcp_vertex_ai -> gemini-2.5-pro") -else - echo "⚠️ Skipping GCP Vertex AI tests (GCP_PROJECT_ID required)" -fi - -# Snowflake: requires SNOWFLAKE_HOST and SNOWFLAKE_TOKEN -if [ -n "$SNOWFLAKE_HOST" ] && [ -n "$SNOWFLAKE_TOKEN" ]; then - echo "✓ Including Snowflake tests" - PROVIDERS+=("snowflake -> claude-sonnet-4-5") -else - echo "⚠️ Skipping Snowflake tests (SNOWFLAKE_HOST and SNOWFLAKE_TOKEN required)" -fi - -# Venice: requires VENICE_API_KEY -if [ -n "$VENICE_API_KEY" ]; then - echo "✓ Including Venice tests" - PROVIDERS+=("venice -> llama-3.3-70b") -else - echo "⚠️ Skipping Venice tests (VENICE_API_KEY required)" -fi - -# LiteLLM: requires LITELLM_API_KEY (and optionally LITELLM_HOST) -if [ -n "$LITELLM_API_KEY" ]; then - echo "✓ Including LiteLLM tests" - PROVIDERS+=("litellm -> gpt-4o-mini") -else - echo "⚠️ Skipping LiteLLM tests (LITELLM_API_KEY required)" -fi - -# Ollama: requires OLLAMA_HOST (or uses default localhost:11434) -if [ -n "$OLLAMA_HOST" ] || command -v ollama &> /dev/null; then - echo "✓ Including Ollama tests" - PROVIDERS+=("ollama -> qwen3") -else - echo "⚠️ Skipping Ollama tests (OLLAMA_HOST required or ollama must be installed)" -fi - -# SageMaker TGI: requires AWS credentials and SAGEMAKER_ENDPOINT_NAME -if [ -n "$SAGEMAKER_ENDPOINT_NAME" ] && [ -n 
"$AWS_REGION" ]; then - echo "✓ Including SageMaker TGI tests" - PROVIDERS+=("sagemaker_tgi -> sagemaker-tgi-endpoint") -else - echo "⚠️ Skipping SageMaker TGI tests (SAGEMAKER_ENDPOINT_NAME and AWS_REGION required)" -fi - -# GitHub Copilot: requires OAuth setup (check for cached token) -if [ -n "$GITHUB_COPILOT_TOKEN" ] || [ -f "$HOME/.config/goose/github_copilot_token.json" ]; then - echo "✓ Including GitHub Copilot tests" - PROVIDERS+=("github_copilot -> gpt-4.1") -else - echo "⚠️ Skipping GitHub Copilot tests (OAuth setup required - run 'goose configure' first)" -fi - -# ChatGPT Codex: requires OAuth setup -if [ -n "$CHATGPT_CODEX_TOKEN" ] || [ -f "$HOME/.config/goose/chatgpt_codex_token.json" ]; then - echo "✓ Including ChatGPT Codex tests" - PROVIDERS+=("chatgpt_codex -> gpt-5.1-codex") -else - echo "⚠️ Skipping ChatGPT Codex tests (OAuth setup required - run 'goose configure' first)" -fi - -# CLI-based providers (require the CLI tool to be installed) - -# Claude Code CLI: requires 'claude' CLI tool -if command -v claude &> /dev/null; then - echo "✓ Including Claude Code CLI tests" - PROVIDERS+=("claude-code -> claude-sonnet-4-20250514") -else - echo "⚠️ Skipping Claude Code CLI tests ('claude' CLI tool required)" -fi - -# Codex CLI: requires 'codex' CLI tool -if command -v codex &> /dev/null; then - echo "✓ Including Codex CLI tests" - PROVIDERS+=("codex -> gpt-5.2-codex") -else - echo "⚠️ Skipping Codex CLI tests ('codex' CLI tool required)" -fi - -# Gemini CLI: requires 'gemini' CLI tool -if command -v gemini &> /dev/null; then - echo "✓ Including Gemini CLI tests" - PROVIDERS+=("gemini-cli -> gemini-2.5-pro") -else - echo "⚠️ Skipping Gemini CLI tests ('gemini' CLI tool required)" -fi - -# Cursor Agent: requires 'cursor-agent' CLI tool -if command -v cursor-agent &> /dev/null; then - echo "✓ Including Cursor Agent tests" - PROVIDERS+=("cursor-agent -> auto") -else - echo "⚠️ Skipping Cursor Agent tests ('cursor-agent' CLI tool required)" -fi - -echo "" - -# Configure mode-specific settings -if [ "$CODE_EXEC_MODE" = true ]; then - echo "Mode: code_execution (JS batching)" - BUILTINS="developer,code_execution" - # Match code_execution tool usage: - # - "execute | code_execution" or "get_function_details | code_execution" (fallback format) - # - "tool call | execute" or "tool calls | execute" (new format with tool_graph) - SUCCESS_PATTERN="(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? 
\| execute)" - SUCCESS_MSG="code_execution tool called" - FAILURE_MSG="no code_execution tools called" -else - echo "Mode: normal (direct tool calls)" - BUILTINS="developer,autovisualiser,computercontroller,tutorial,todo,extensionmanager" - SUCCESS_PATTERN="shell \| developer" - SUCCESS_MSG="developer tool called" - FAILURE_MSG="no developer tools called" -fi -echo "" - -is_allowed_failure() { - local provider="$1" - local model="$2" - local key="${provider}:${model}" - for allowed in "${ALLOWED_FAILURES[@]}"; do - if [ "$allowed" = "$key" ]; then - return 0 - fi - done - return 1 -} - -should_skip_provider() { - local provider="$1" - if [ -z "$SKIP_PROVIDERS" ]; then - return 1 - fi - IFS=',' read -ra SKIP_LIST <<< "$SKIP_PROVIDERS" - for skip in "${SKIP_LIST[@]}"; do - # Trim whitespace - skip=$(echo "$skip" | xargs) - if [ "$skip" = "$provider" ]; then - return 0 - fi - done - return 1 -} - -is_agentic_provider() { - local provider="$1" - for agentic in "${AGENTIC_PROVIDERS[@]}"; do - if [ "$agentic" = "$provider" ]; then - return 0 - fi - done - return 1 -} - -# Create temp directory for results -RESULTS_DIR=$(mktemp -d) -trap "rm -rf $RESULTS_DIR" EXIT - -# Maximum parallel jobs (default: number of CPU cores, or override with MAX_PARALLEL) -MAX_PARALLEL=${MAX_PARALLEL:-$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 8)} -echo "Running tests with up to $MAX_PARALLEL parallel jobs" -echo "" - -# Function to run a single test run_test() { - local provider="$1" - local model="$2" - local result_file="$3" - local output_file="$4" - + local provider="$1" model="$2" result_file="$3" output_file="$4" local testdir=$(mktemp -d) - # Agentic providers use a file-read prompt with known content marker; - # regular providers use the shell prompt that produces tool-call logs. local prompt if is_agentic_provider "$provider"; then cp "$TEST_FILE" "$testdir/test-content.txt" prompt="read ./test-content.txt and output its contents exactly" else - echo "hello" > "$testdir/hello.txt" - prompt="Immediately use the shell tool to run 'ls'. Do not ask for confirmation." + echo "$TEST_CONTENT" > "$testdir/input.txt" + prompt="Use the text_editor view command to read ./input.txt, then output this file's contents in UPPERCASE. 
Do NOT use any other tool in Developer" fi - # Run the test and capture output ( export GOOSE_PROVIDER="$provider" export GOOSE_MODEL="$model" - cd "$testdir" && "$SCRIPT_DIR/target/debug/goose" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 + cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1 ) > "$output_file" 2>&1 - # Check result: agentic providers return text containing the test content - # instead of producing tool-call log patterns if is_agentic_provider "$provider"; then if grep -qi "$TEST_CONTENT" "$output_file"; then - echo "success" > "$result_file" + echo "success|test content found by model" > "$result_file" else - echo "failure" > "$result_file" + echo "failure|test content not found by model" > "$result_file" fi - elif grep -qE "$SUCCESS_PATTERN" "$output_file"; then - echo "success" > "$result_file" else - echo "failure" > "$result_file" - fi - - rm -rf "$testdir" -} - -# Build list of all provider/model combinations -JOBS=() -job_index=0 -for provider_config in "${PROVIDERS[@]}"; do - PROVIDER="${provider_config%% -> *}" - MODELS_STR="${provider_config#* -> }" - - # Skip provider if it's in SKIP_PROVIDERS - if should_skip_provider "$PROVIDER"; then - echo "⊘ Skipping provider: ${PROVIDER} (SKIP_PROVIDERS)" - continue - fi - - # Agentic providers don't use goose's code_execution system - if [ "$CODE_EXEC_MODE" = true ] && is_agentic_provider "$PROVIDER"; then - echo "⊘ Skipping agentic provider in code_exec mode: ${PROVIDER}" - continue - fi - - IFS='|' read -ra MODELS <<< "$MODELS_STR" - for MODEL in "${MODELS[@]}"; do - JOBS+=("$PROVIDER|$MODEL|$job_index") - ((job_index++)) - done -done - -total_jobs=${#JOBS[@]} -echo "Starting $total_jobs tests..." -echo "" - -# Run first test sequentially if any jobs exist -if [ $total_jobs -gt 0 ]; then - echo "Running first test sequentially..." - first_job="${JOBS[0]}" - IFS='|' read -r provider model idx <<< "$first_job" - - result_file="$RESULTS_DIR/result_$idx" - output_file="$RESULTS_DIR/output_$idx" - meta_file="$RESULTS_DIR/meta_$idx" - echo "$provider|$model" > "$meta_file" - - # Run first test and wait for it to complete - run_test "$provider" "$model" "$result_file" "$output_file" - echo "First test completed." - echo "" -fi - -# Run remaining tests in parallel -if [ $total_jobs -gt 1 ]; then - echo "Running remaining tests in parallel..." 
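+        # goose logs each direct tool call as "<tool> | <extension>"
+        # (e.g. "text_editor | developer"); these checks key on that log line.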
+        if ! grep -q "text_editor | developer" "$output_file"; then
+            echo "failure|model did not use text_editor tool" > "$result_file"
+        elif ! grep -q "TEST-CONTENT-ABC123" "$output_file"; then
+            echo "failure|model did not return uppercased file content" > "$result_file"
         else
-            echo "✗ FAILED: Test failed - $FAILURE_MSG"
-            RESULTS+=("✗ ${provider}: ${model}")
-            HARD_FAILURES+=("${provider}: ${model}")
+            echo "success|model read and uppercased file content" > "$result_file"
         fi
     fi
-    echo "---"
-done
 
-echo ""
-echo "=== Test Summary ==="
-for result in "${RESULTS[@]}"; do
-    echo "$result"
-done
+    rm -rf "$testdir"
+}
 
-if [ ${#HARD_FAILURES[@]} -gt 0 ]; then
-    echo ""
-    echo "Hard failures (${#HARD_FAILURES[@]}):"
-    for failure in "${HARD_FAILURES[@]}"; do
-        echo "  - $failure"
-    done
-    echo ""
-    echo "Some tests failed!"
-    exit 1
-else
-    if echo "${RESULTS[@]}" | grep -q "⚠"; then
-        echo ""
-        echo "All required tests passed! (some flaky tests failed but are allowed)"
-    else
-        echo ""
-        echo "All tests passed!"
-    fi
-fi
+build_test_cases
+run_test_cases run_test
+report_results
diff --git a/scripts/test_providers_code_exec.sh b/scripts/test_providers_code_exec.sh
new file mode 100755
index 00000000000..d0737c37cef
--- /dev/null
+++ b/scripts/test_providers_code_exec.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Provider smoke tests - code execution mode (JS batching)
+
+LIB_DIR="$(cd "$(dirname "$0")" && pwd)"
+source "$LIB_DIR/test_providers_lib.sh"
+
+echo "Mode: code_execution (JS batching)"
+echo ""
+
+# --- Setup ---
+
+GOOSE_BIN=$(build_goose)
+BUILTINS="developer,code_execution"
+
+# --- Test case ---
+
+run_test() {
+    local provider="$1" model="$2" result_file="$3" output_file="$4"
+    local testdir=$(mktemp -d)
+
+    echo "hello" > "$testdir/hello.txt"
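+    # In code_execution mode the model is expected to batch tool use through
+    # the JS execute function rather than call shell directly, so a plain
+    # 'ls' task is enough to exercise that path.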
+    local prompt="Run 'ls' to list files in the current directory."
+
+    # Run goose
+    (
+        export GOOSE_PROVIDER="$provider"
+        export GOOSE_MODEL="$model"
+        cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1
+    ) > "$output_file" 2>&1
+
+    # Verify: code_execution tool must be called
+    # Matches: "execute | code_execution", "get_function_details | code_execution",
+    #          "tool call | execute", "tool calls | execute"
+    if grep -qE "(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)" "$output_file"; then
+        echo "success|code_execution tool called" > "$result_file"
+    else
+        echo "failure|no code_execution tool calls found" > "$result_file"
+    fi
+
+    rm -rf "$testdir"
+}
+
+build_test_cases --skip-agentic
+run_test_cases run_test
+report_results
diff --git a/scripts/test_providers_lib.sh b/scripts/test_providers_lib.sh
new file mode 100755
index 00000000000..b56f13a8998
--- /dev/null
+++ b/scripts/test_providers_lib.sh
@@ -0,0 +1,243 @@
+#!/bin/bash
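+
+# Shared helpers for the provider smoke-test scripts. Callers (see
+# test_providers.sh and test_providers_code_exec.sh) are expected to:
+#   1. source this file and run GOOSE_BIN=$(build_goose)
+#   2. define run_test <provider> <model> <result_file> <output_file>,
+#      writing "success|<msg>" or "failure|<msg>" to the result file
+#   3. call build_test_cases [--skip-agentic], then run_test_cases run_test,
+#      then report_results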
+
+PROVIDER_CONFIG="
+openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b
+xai -> grok-3
+openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5
+anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-5-20251101
+google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview
+tetrate -> claude-sonnet-4-20250514
+databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o
+azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}
+aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0
+gcp_vertex_ai -> gemini-2.5-pro
+snowflake -> claude-sonnet-4-5
+venice -> llama-3.3-70b
+litellm -> gpt-4o-mini
+sagemaker_tgi -> sagemaker-tgi-endpoint
+github_copilot -> gpt-4.1
+chatgpt_codex -> gpt-5.1-codex
+claude-code -> claude-sonnet-4-20250514
+codex -> gpt-5.2-codex
+gemini-cli -> gemini-2.5-pro
+cursor-agent -> auto
+ollama -> qwen3
+"
+
+# Flaky models allowed to fail without blocking PRs.
+ALLOWED_FAILURES=(
+    "google:gemini-2.5-flash"
+    "google:gemini-3-pro-preview"
+    "openrouter:nvidia/nemotron-3-nano-30b-a3b"
+    "openai:gpt-3.5-turbo"
+)
+
+AGENTIC_PROVIDERS=("claude-code" "codex" "gemini-cli" "cursor-agent")
+
+if [ -f .env ]; then
+    export $(grep -v '^#' .env | xargs)
+fi
+
+build_goose() {
+    if [ -z "$SKIP_BUILD" ]; then
+        echo "Building goose..." >&2
+        cargo build --bin goose >&2
+        echo "" >&2
+    else
+        echo "Skipping build (SKIP_BUILD is set)..." >&2
+        echo "" >&2
+    fi
+
+    echo "$(pwd)/target/debug/goose"
+}
+
+has_env() { [ -n "${!1}" ]; }
+has_cmd() { command -v "$1" &>/dev/null; }
+has_file() { [ -f "$1" ]; }
+
+is_provider_available() {
+    case "$1" in
+        openrouter) has_env OPENROUTER_API_KEY ;;
+        xai) has_env XAI_API_KEY ;;
+        openai) has_env OPENAI_API_KEY ;;
+        anthropic) has_env ANTHROPIC_API_KEY ;;
+        google) has_env GOOGLE_API_KEY ;;
+        tetrate) has_env TETRATE_API_KEY ;;
+        databricks) has_env DATABRICKS_HOST && has_env DATABRICKS_TOKEN ;;
+        azure_openai) has_env AZURE_OPENAI_ENDPOINT && has_env AZURE_OPENAI_DEPLOYMENT_NAME ;;
+        aws_bedrock) has_env AWS_REGION && { has_env AWS_PROFILE || has_env AWS_ACCESS_KEY_ID; } ;;
+        gcp_vertex_ai) has_env GCP_PROJECT_ID ;;
+        snowflake) has_env SNOWFLAKE_HOST && has_env SNOWFLAKE_TOKEN ;;
+        venice) has_env VENICE_API_KEY ;;
+        litellm) has_env LITELLM_API_KEY ;;
+        sagemaker_tgi) has_env SAGEMAKER_ENDPOINT_NAME && has_env AWS_REGION ;;
+        github_copilot) has_env GITHUB_COPILOT_TOKEN || has_file "$HOME/.config/goose/github_copilot_token.json" ;;
+        chatgpt_codex) has_env CHATGPT_CODEX_TOKEN || has_file "$HOME/.config/goose/chatgpt_codex_token.json" ;;
+        ollama) has_env OLLAMA_HOST || has_cmd ollama ;;
+        claude-code) has_cmd claude ;;
+        codex) has_cmd codex ;;
+        gemini-cli) has_cmd gemini ;;
+        cursor-agent) has_cmd cursor-agent ;;
+        *) return 0 ;;
+    esac
+}
+
+is_allowed_failure() {
+    local key="${1}:${2}"
+    for allowed in "${ALLOWED_FAILURES[@]}"; do
+        [ "$allowed" = "$key" ] && return 0
+    done
+    return 1
+}
+
+should_skip_provider() {
+    [ -z "$SKIP_PROVIDERS" ] && return 1
+    IFS=',' read -ra SKIP_LIST <<< "$SKIP_PROVIDERS"
+    for skip in "${SKIP_LIST[@]}"; do
+        skip=$(echo "$skip" | xargs)
+        [ "$skip" = "$1" ] && return 0
+    done
+    return 1
+}
+
+is_agentic_provider() {
+    for agentic in "${AGENTIC_PROVIDERS[@]}"; do
+        [ "$agentic" = "$1" ] && return 0
+    done
+    return 1
+}
+
+# build_test_cases [--skip-agentic]
+build_test_cases() {
+    local skip_agentic=false
+    [ "$1" = "--skip-agentic" ] && skip_agentic=true
+
+    local providers=()
+    while IFS= read -r line; do
+        [[ "$line" =~ ^#.*$ || -z "$line" ]] && continue
+        local provider="${line%% -> *}"
+        if is_provider_available "$provider"; then
+            providers+=("$line")
+            echo "✓ Including $provider"
+        else
+            echo "⚠️ Skipping $provider (prerequisites not met)"
+        fi
+    done <<< "$PROVIDER_CONFIG"
+    echo ""
+
+    TEST_CASES=()
+    local job_index=0
+    for provider_config in "${providers[@]}"; do
+        local provider="${provider_config%% -> *}"
+        local models_str="${provider_config#* -> }"
+
+        if should_skip_provider "$provider"; then
+            echo "⊘ Skipping provider: ${provider} (SKIP_PROVIDERS)"
+            continue
+        fi
+
+        if [ "$skip_agentic" = true ] && is_agentic_provider "$provider"; then
+            echo "⊘ Skipping agentic provider: ${provider}"
+            continue
+        fi
+
+        IFS='|' read -ra models <<< "$models_str"
+        for model in "${models[@]}"; do
+            TEST_CASES+=("$provider|$model|$job_index")
+            ((job_index++))
+        done
+    done
+}
+
+# run_test_cases
+run_test_cases() {
+    local test_fn="$1"
+
+    RESULTS_DIR=$(mktemp -d)
+    trap 'if [ -n "${RESULTS_DIR:-}" ]; then rm -rf -- "$RESULTS_DIR"; fi; if [ -n "${CLEANUP_DIR:-}" ]; then rm -rf -- "$CLEANUP_DIR"; fi' EXIT
+    MAX_PARALLEL=${MAX_PARALLEL:-$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 8)}
+    echo "Running ${#TEST_CASES[@]} tests (max $MAX_PARALLEL parallel)"
+    echo ""
+
+    local running=0
+    for ((i=0; i<${#TEST_CASES[@]}; i++)); do
+        IFS='|' read -r provider model idx <<< "${TEST_CASES[$i]}"
+
+        if [ $i -eq 0 ]; then
+            # First test runs sequentially to catch early failures
+            "$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx"
+        else
+            "$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx" &
+            ((running++))
+            if [ $running -ge $MAX_PARALLEL ]; then
+                wait -n 2>/dev/null || wait
+                ((running--))
+            fi
+        fi
+    done
+    wait
+}
+
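+# report_results: prints each test's captured output plus a summary; expects
+# result files in the "status|message" format written by the run_test callbacks.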
+report_results() {
+    echo ""
+    echo "=== Test Results ==="
+    echo ""
+
+    RESULTS=()
+    HARD_FAILURES=()
+
+    for job in "${TEST_CASES[@]}"; do
+        IFS='|' read -r provider model idx <<< "$job"
+
+        echo "Provider: $provider"
+        echo "Model: $model"
+        echo ""
+        cat "$RESULTS_DIR/output_$idx"
+        echo ""
+
+        local result_line=""
+        [ -f "$RESULTS_DIR/result_$idx" ] && result_line=$(cat "$RESULTS_DIR/result_$idx")
+        local status="${result_line%%|*}"
+        local msg="${result_line#*|}"
+
+        if [ "$status" = "success" ]; then
+            echo "✓ SUCCESS: $msg"
+            RESULTS+=("✓ ${provider}: ${model}")
+        else
+            if is_allowed_failure "$provider" "$model"; then
+                echo "⚠ FLAKY: $msg"
+                RESULTS+=("⚠ ${provider}: ${model} (flaky)")
+            else
+                echo "✗ FAILED: $msg"
+                RESULTS+=("✗ ${provider}: ${model}")
+                HARD_FAILURES+=("${provider}: ${model}")
+            fi
+        fi
+        echo "---"
+    done
+
+    echo ""
+    echo "=== Test Summary ==="
+    for result in "${RESULTS[@]}"; do
+        echo "$result"
+    done
+
+    if [ ${#HARD_FAILURES[@]} -gt 0 ]; then
+        echo ""
+        echo "Hard failures (${#HARD_FAILURES[@]}):"
+        for failure in "${HARD_FAILURES[@]}"; do
+            echo "  - $failure"
+        done
+        echo ""
+        echo "Some tests failed!"
+        exit 1
+    else
+        if echo "${RESULTS[@]}" | grep -q "⚠"; then
+            echo ""
+            echo "All required tests passed! (some flaky tests failed but are allowed)"
+        else
+            echo ""
+            echo "All tests passed!"
+        fi
+    fi
+}