3 changes: 2 additions & 1 deletion .env.example
@@ -5,4 +5,5 @@ FIRECRAWL_API_KEY=<your Firecrawl api key here>
SCRAPINGBEE_API_KEY=<your ScrapingBee api key here>
SCRAPERAPI_API_KEY=<your ScraperAPI api key here>
TAVILY_API_KEY=<your Tavily api key here>
ZYTE_API_KEY=<your ZYTE api key>
TERACRAWL_API_URL=<your Teracrawl API URL here>
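
For a local deployment, the new variable might be filled in like this (illustrative values; the localhost default mirrors `engines/teracrawl_api.py`, and `TERACRAWL_TIMEOUT` is an optional override the scraper also reads even though it is not listed in `.env.example`):

```
TERACRAWL_API_URL=http://localhost:8085
TERACRAWL_TIMEOUT=600
```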
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.13
1 change: 1 addition & 0 deletions README.md
@@ -10,6 +10,7 @@ Below are evaluation results across different engines.

| Engine | Coverage (Success Rate) (%) | Quality (F1) |
|-----------------|-----------------------------|--------------|
| Teracrawl | 84.2 | 0.62 |
| Firecrawl | 80.9 | 0.68 |
| Exa | 76.3 | 0.53 |
| Tavily | 67.6 | 0.50 |
3 changes: 3 additions & 0 deletions engines/base.py
@@ -38,6 +38,9 @@ class ScrapeResult(TypedDict, total=False):
    # Tavily
    "tavily_api",

    # Teracrawl
    "teracrawl_api",

    # Zyte
    "zyte_api",
]
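
For context, the fields the new engine populates imply roughly this `ScrapeResult` shape. This is a sketch reconstructed from `engines/teracrawl_api.py` below, not the actual definition in `engines/base.py`:

```python
from typing import Optional, TypedDict

# Hypothetical reconstruction of ScrapeResult, inferred from the fields the
# Teracrawl engine fills in; the real definition lives in engines/base.py.
class ScrapeResult(TypedDict, total=False):
    run_id: str
    scraper: str            # engine name, e.g. "teracrawl_api"
    url: str
    status_code: int
    error: Optional[str]
    content_size: int       # size of the UTF-8 encoded content in bytes
    format: str             # this engine always reports "markdown"
    created_at: str         # ISO-8601 timestamp
    content: Optional[str]
```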
135 changes: 135 additions & 0 deletions engines/teracrawl_api.py
@@ -0,0 +1,135 @@
import os
import sys
from pathlib import Path
from datetime import datetime
import asyncio

# Add project root and src to Python path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "src"))

from dotenv import load_dotenv
from .base import Scraper, ScrapeResult

try:
    import httpx
except ImportError:
    httpx = None  # type: ignore

load_dotenv()


class TeracrawlAPIScraper(Scraper):
    """Scrapes web pages using the Teracrawl API (single page scrape endpoint)."""

    def __init__(self):
        if httpx is None:
            raise ImportError("httpx is required for TeracrawlAPIScraper. Install with: pip install httpx")

        # Default to localhost:8085 as per Teracrawl documentation
        self.api_url = os.getenv("TERACRAWL_API_URL", "http://localhost:8085")
        self.timeout = float(os.getenv("TERACRAWL_TIMEOUT", "600"))

    def check_environment(self) -> bool:
        """Check if the Teracrawl API is accessible."""
        if httpx is None:
            return False
        try:
            with httpx.Client(timeout=5.0) as client:
                response = client.get(f"{self.api_url}/health")
                return response.status_code == 200
        except Exception:
            return False

    async def scrape(self, url: str, run_id: str) -> ScrapeResult:
        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
                    f"{self.api_url}/scrape",
                    json={"url": url},
                    headers={"Content-Type": "application/json"},
                )

                if response.status_code != 200:
                    return ScrapeResult(
                        run_id=run_id,
                        scraper="teracrawl_api",
                        url=url,
                        status_code=response.status_code,
                        error=f"HTTP error: {response.status_code}",
                        content_size=0,
                        format="markdown",
                        created_at=datetime.now().isoformat(),
                        content=None,
                    )

                data = response.json()

                # Check status field from Teracrawl response
                status = data.get("status", "")
                if status == "error":
                    return ScrapeResult(
                        run_id=run_id,
                        scraper="teracrawl_api",
                        url=url,
                        status_code=500,
                        error=data.get("error", "Unknown error from Teracrawl"),
                        content_size=0,
                        format="markdown",
                        created_at=datetime.now().isoformat(),
                        content=None,
                    )

                # Extract markdown content
                markdown = data.get("markdown", "")
                content_size = len(markdown.encode("utf-8")) if markdown else 0

                return ScrapeResult(
                    run_id=run_id,
                    scraper="teracrawl_api",
                    url=url,
                    status_code=200,
                    error=None,
                    content_size=content_size,
                    format="markdown",
                    created_at=datetime.now().isoformat(),
                    content=markdown or None,
                )

        except (asyncio.TimeoutError, httpx.TimeoutException):
            # httpx raises TimeoutException; asyncio.TimeoutError is kept as a
            # safety net. Both map to the same 408 result.
            return ScrapeResult(
                run_id=run_id,
                scraper="teracrawl_api",
                url=url,
                status_code=408,
                error="Timeout error",
                content_size=0,
                format="markdown",
                created_at=datetime.now().isoformat(),
                content=None,
            )
        except Exception as e:
            return ScrapeResult(
                run_id=run_id,
                scraper="teracrawl_api",
                url=url,
                status_code=500,
                error=f"{type(e).__name__}: {str(e)}",
                content_size=0,
                format="markdown",
                created_at=datetime.now().isoformat(),
                content=None,
            )
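
A minimal usage sketch, assuming a Teracrawl instance is reachable (e.g. at the default `http://localhost:8085`); the target URL and `run_id` below are made up for illustration:

```python
import asyncio

from engines.teracrawl_api import TeracrawlAPIScraper

async def demo() -> None:
    scraper = TeracrawlAPIScraper()
    # check_environment() pings GET /health before attempting a scrape
    if not scraper.check_environment():
        print("Teracrawl API is not reachable")
        return
    result = await scraper.scrape("https://example.com", run_id="demo-run")
    print(result["status_code"], result["content_size"], result["error"])

if __name__ == "__main__":
    asyncio.run(demo())
```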
6 changes: 6 additions & 0 deletions main.py
@@ -0,0 +1,6 @@
def main():
    print("Hello from scrape-evals!")


if __name__ == "__main__":
    main()
23 changes: 23 additions & 0 deletions pyproject.toml
@@ -0,0 +1,23 @@
[project]
name = "scrape-evals"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "apify-client>=2.3.0",
    "crawl4ai>=0.7.7",
    "exa-py>=2.0.1",
    "firecrawl>=4.10.0",
    "nbformat>=5.10.4",
    "pandas>=2.3.3",
    "playwright>=1.56.0",
    "plotly>=6.5.0",
    "python-dotenv>=1.2.1",
    "requests>=2.32.5",
    "scrapingbee>=2.0.2",
    "scrapy>=2.13.4",
    "selenium>=4.38.0",
    "tavily-python>=0.7.13",
    "typer>=0.20.0",
]
6 changes: 6 additions & 0 deletions runs/results/teracrawl_api_quality.json
@@ -0,0 +1,6 @@
{
  "success_rate": 0.842,
  "avg_recall": 0.625746455360982,
  "avg_precision": 0.6414422764926457,
  "avg_f1": 0.6270447297905766
}
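
Note that `avg_f1` (≈0.627) is lower than the F1 of the averaged precision and recall (≈0.633), which suggests F1 is computed per page and then averaged; that is an inference from the numbers, not something this diff states:

```python
# F1 of the averaged precision/recall, for comparison with avg_f1.
p, r = 0.6414422764926457, 0.625746455360982
print(2 * p * r / (p + r))  # ~0.6335, vs the reported avg_f1 of ~0.6270
```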