|
| 1 | +""" |
| 2 | +E2E tests ensuring that extract() results validate into snake_case Pydantic |
| 3 | +schemas in both the LOCAL and BROWSERBASE environments, including API |
| 4 | +responses that may use camelCase keys. |
| 5 | +""" |
| 6 | + |
| 7 | +import os |
| 8 | +import pytest |
| 9 | +import pytest_asyncio |
| 10 | +from urllib.parse import urlparse |
| 11 | +from pydantic import BaseModel, Field, HttpUrl |
| 12 | + |
| 13 | +from stagehand import Stagehand, StagehandConfig |
| 14 | +from stagehand.schemas import ExtractOptions |
| 15 | + |
| 16 | + |
# Schema for a single extracted company entry.
# NOTE: documented with comments instead of a class docstring, because Pydantic
# copies a model's docstring into the JSON schema it generates.
class Company(BaseModel):
    # Both fields are required (`...` means "no default").
    company_name: str = Field(
        ...,
        description="The name of the company",
    )
    company_url: HttpUrl = Field(
        ...,
        description="The URL of the company website or relevant page",
    )
| 20 | + |
| 21 | + |
# Top-level extraction schema: a wrapper around the list of companies.
# NOTE: documented with comments instead of a class docstring, because Pydantic
# copies a model's docstring into the JSON schema it generates.
class Companies(BaseModel):
    # Required field (`...` means "no default").
    companies: list[Company] = Field(
        ...,
        description="List of companies extracted from the page",
    )
| 24 | + |
| 25 | + |
@pytest.fixture(scope="class")
def local_config():
    """Return the class-scoped ``StagehandConfig`` for the LOCAL environment."""
    # MODEL_API_KEY takes precedence; OPENAI_API_KEY is the fallback when the
    # former is unset or empty.
    model_api_key = os.getenv("MODEL_API_KEY") or os.getenv("OPENAI_API_KEY")
    client_options = {"apiKey": model_api_key}

    return StagehandConfig(
        env="LOCAL",
        model_name="gpt-4o-mini",
        headless=True,
        verbose=1,
        dom_settle_timeout_ms=2000,
        model_client_options=client_options,
    )
| 38 | + |
| 39 | + |
@pytest.fixture(scope="class")
def browserbase_config():
    """Return the class-scoped ``StagehandConfig`` for the BROWSERBASE environment.

    Credentials are read from the environment and passed through as-is; this
    fixture does not check whether they are set.
    """
    browserbase_api_key = os.getenv("BROWSERBASE_API_KEY")
    browserbase_project_id = os.getenv("BROWSERBASE_PROJECT_ID")
    # MODEL_API_KEY takes precedence; OPENAI_API_KEY is the fallback when the
    # former is unset or empty.
    model_api_key = os.getenv("MODEL_API_KEY") or os.getenv("OPENAI_API_KEY")
    client_options = {"apiKey": model_api_key}

    return StagehandConfig(
        env="BROWSERBASE",
        api_key=browserbase_api_key,
        project_id=browserbase_project_id,
        model_name="gpt-4o",
        headless=False,
        verbose=2,
        dom_settle_timeout_ms=3000,
        model_client_options=client_options,
    )
| 54 | + |
| 55 | + |
@pytest_asyncio.fixture
async def local_stagehand(local_config):
    """Yield an initialised ``Stagehand`` built from ``local_config``.

    The instance is closed once the requesting test has finished.
    """
    session = Stagehand(config=local_config)
    await session.init()

    yield session

    # Teardown: runs after the test that requested this fixture.
    await session.close()
| 62 | + |
| 63 | + |
@pytest_asyncio.fixture
async def browserbase_stagehand(browserbase_config):
    """Yield an initialised ``Stagehand`` built from ``browserbase_config``.

    The requesting test is skipped when either Browserbase credential is
    missing from the environment. The instance is closed once the test has
    finished.
    """
    # Both variables must be set (and non-empty) before a session is attempted.
    if not os.getenv("BROWSERBASE_API_KEY") or not os.getenv("BROWSERBASE_PROJECT_ID"):
        pytest.skip("Browserbase credentials not available")

    session = Stagehand(config=browserbase_config)
    await session.init()

    yield session

    # Teardown: runs after the test that requested this fixture.
    await session.close()
| 72 | + |
| 73 | + |
@pytest.mark.asyncio
@pytest.mark.local
async def test_extract_companies_casing_local(local_stagehand):
    """Extract companies in LOCAL mode and check the validated ``Companies`` result."""
    session = local_stagehand
    # Use stable eval site for consistency
    await session.page.goto(
        "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/"
    )

    options = ExtractOptions(
        instruction="Extract the names and URLs of up to 5 companies in batch 3",
        schema_definition=Companies,
    )
    extracted = await session.page.extract(options)

    # Should be validated into our snake_case Pydantic model
    assert isinstance(extracted, Companies)
    company_count = len(extracted.companies)
    assert 0 < company_count <= 5

    for company in extracted.companies:
        name = company.company_name
        assert isinstance(name, str)
        assert name
        # Avoid isinstance checks with Pydantic's Annotated types; validate via parsing
        url_parts = urlparse(str(company.company_url))
        assert url_parts.scheme in ("http", "https")
        assert bool(url_parts.netloc)
| 96 | + |
| 97 | + |
@pytest.mark.asyncio
@pytest.mark.api
@pytest.mark.skipif(
    not os.getenv("BROWSERBASE_API_KEY") or not os.getenv("BROWSERBASE_PROJECT_ID"),
    reason="Browserbase credentials not available",
)
async def test_extract_companies_casing_browserbase(browserbase_stagehand):
    """Extract companies in BROWSERBASE mode and check the validated ``Companies`` result."""
    session = browserbase_stagehand
    # Use stable eval site for consistency
    await session.page.goto(
        "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/"
    )

    options = ExtractOptions(
        instruction="Extract the names and URLs of up to 5 companies in batch 3",
        schema_definition=Companies,
    )
    extracted = await session.page.extract(options)

    # Should be validated into our snake_case Pydantic model even if API returns camelCase
    assert isinstance(extracted, Companies)
    company_count = len(extracted.companies)
    assert 0 < company_count <= 5

    for company in extracted.companies:
        name = company.company_name
        assert isinstance(name, str)
        assert name
        # Validate the URL by parsing its string form.
        url_parts = urlparse(str(company.company_url))
        assert url_parts.scheme in ("http", "https")
        assert bool(url_parts.netloc)
| 123 | + |
| 124 | + |
0 commit comments