3 changes: 2 additions & 1 deletion .env.example
@@ -5,4 +5,5 @@ FIRECRAWL_API_KEY=<your Firecrawl api key here>
SCRAPINGBEE_API_KEY=<your ScrapingBee api key here>
SCRAPERAPI_API_KEY=<your ScraperAPI api key here>
TAVILY_API_KEY=<your Tavily api key here>
ZYTE_API_KEY=<your ZYTE api key>
TERACRAWL_API_URL=<your Teracrawl API URL here>
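
For a local deployment, the new variable might be filled in like this (illustrative values; the localhost default mirrors `engines/teracrawl_api.py`, and `TERACRAWL_TIMEOUT` is an optional override the scraper also reads even though it is not listed in `.env.example`):

```
TERACRAWL_API_URL=http://localhost:8085
TERACRAWL_TIMEOUT=600
```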
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.13
1 change: 1 addition & 0 deletions README.md
@@ -10,6 +10,7 @@ Below are evaluation results across different engines.

| Engine | Coverage (Success Rate) (%) | Quality (F1) |
|-----------------|-----------------------------|--------------|
| Teracrawl | 84.2 | 0.62 |
| Firecrawl | 80.9 | 0.68 |
| Exa | 76.3 | 0.53 |
| Tavily | 67.6 | 0.50 |
3 changes: 3 additions & 0 deletions engines/base.py
@@ -38,6 +38,9 @@ class ScrapeResult(TypedDict, total=False):
    # Tavily
    "tavily_api",

    # Teracrawl
    "teracrawl_api",

    # Zyte
    "zyte_api",
]
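
For context, the fields the new engine populates imply roughly this `ScrapeResult` shape. This is a sketch reconstructed from `engines/teracrawl_api.py` below, not the actual definition in `engines/base.py`:

```python
from typing import Optional, TypedDict

# Hypothetical reconstruction of ScrapeResult, inferred from the fields the
# Teracrawl engine fills in; the real definition lives in engines/base.py.
class ScrapeResult(TypedDict, total=False):
    run_id: str
    scraper: str            # engine name, e.g. "teracrawl_api"
    url: str
    status_code: int
    error: Optional[str]
    content_size: int       # size of the UTF-8 encoded content in bytes
    format: str             # this engine always reports "markdown"
    created_at: str         # ISO-8601 timestamp
    content: Optional[str]
```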
135 changes: 135 additions & 0 deletions engines/teracrawl_api.py
@@ -0,0 +1,135 @@
import os
import sys
from pathlib import Path
from datetime import datetime
import asyncio

# Add project root and src to Python path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "src"))

from dotenv import load_dotenv
from .base import Scraper, ScrapeResult

try:
    import httpx
except ImportError:
    httpx = None  # type: ignore

load_dotenv()


class TeracrawlAPIScraper(Scraper):
    """Scrapes web pages using the Teracrawl API (single page scrape endpoint)."""

    def __init__(self):
        if httpx is None:
            raise ImportError("httpx is required for TeracrawlAPIScraper. Install with: pip install httpx")

        # Default to localhost:8085 as per Teracrawl documentation
        self.api_url = os.getenv("TERACRAWL_API_URL", "http://localhost:8085")
        self.timeout = float(os.getenv("TERACRAWL_TIMEOUT", "600"))

    def check_environment(self) -> bool:
        """Check if the Teracrawl API is accessible."""
        if httpx is None:
            return False
        try:
            with httpx.Client(timeout=5.0) as client:
                response = client.get(f"{self.api_url}/health")
                return response.status_code == 200
        except Exception:
            return False

    async def scrape(self, url: str, run_id: str) -> ScrapeResult:
        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
                    f"{self.api_url}/scrape",
                    json={"url": url},
                    headers={"Content-Type": "application/json"},
                )

                if response.status_code != 200:
                    return ScrapeResult(
                        run_id=run_id,
                        scraper="teracrawl_api",
                        url=url,
                        status_code=response.status_code,
                        error=f"HTTP error: {response.status_code}",
                        content_size=0,
                        format="markdown",
                        created_at=datetime.now().isoformat(),
                        content=None,
                    )

                data = response.json()

                # Check status field from Teracrawl response
                status = data.get("status", "")
                if status == "error":
                    return ScrapeResult(
                        run_id=run_id,
                        scraper="teracrawl_api",
                        url=url,
                        status_code=500,
                        error=data.get("error", "Unknown error from Teracrawl"),
                        content_size=0,
                        format="markdown",
                        created_at=datetime.now().isoformat(),
                        content=None,
                    )

                # Extract markdown content
                markdown = data.get("markdown", "")
                content_size = len(markdown.encode("utf-8")) if markdown else 0

                return ScrapeResult(
                    run_id=run_id,
                    scraper="teracrawl_api",
                    url=url,
                    status_code=200,
                    error=None,
                    content_size=content_size,
                    format="markdown",
                    created_at=datetime.now().isoformat(),
                    content=markdown or None,
                )

        except (asyncio.TimeoutError, httpx.TimeoutException):
            # httpx raises TimeoutException; asyncio.TimeoutError is kept as a
            # safety net. Both map to the same 408 result.
            return ScrapeResult(
                run_id=run_id,
                scraper="teracrawl_api",
                url=url,
                status_code=408,
                error="Timeout error",
                content_size=0,
                format="markdown",
                created_at=datetime.now().isoformat(),
                content=None,
            )
        except Exception as e:
            return ScrapeResult(
                run_id=run_id,
                scraper="teracrawl_api",
                url=url,
                status_code=500,
                error=f"{type(e).__name__}: {str(e)}",
                content_size=0,
                format="markdown",
                created_at=datetime.now().isoformat(),
                content=None,
            )
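
A minimal usage sketch, assuming a Teracrawl instance is reachable (e.g. at the default `http://localhost:8085`); the target URL and `run_id` below are made up for illustration:

```python
import asyncio

from engines.teracrawl_api import TeracrawlAPIScraper

async def demo() -> None:
    scraper = TeracrawlAPIScraper()
    # check_environment() pings GET /health before attempting a scrape
    if not scraper.check_environment():
        print("Teracrawl API is not reachable")
        return
    result = await scraper.scrape("https://example.com", run_id="demo-run")
    print(result["status_code"], result["content_size"], result["error"])

if __name__ == "__main__":
    asyncio.run(demo())
```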
6 changes: 6 additions & 0 deletions main.py
@@ -0,0 +1,6 @@
def main():
    print("Hello from scrape-evals!")


if __name__ == "__main__":
    main()
23 changes: 23 additions & 0 deletions pyproject.toml
@@ -0,0 +1,23 @@
[project]
name = "scrape-evals"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "apify-client>=2.3.0",
    "crawl4ai>=0.7.7",
    "exa-py>=2.0.1",
    "firecrawl>=4.10.0",
    "nbformat>=5.10.4",
    "pandas>=2.3.3",
    "playwright>=1.56.0",
    "plotly>=6.5.0",
    "python-dotenv>=1.2.1",
    "requests>=2.32.5",
    "scrapingbee>=2.0.2",
    "scrapy>=2.13.4",
    "selenium>=4.38.0",
    "tavily-python>=0.7.13",
    "typer>=0.20.0",
]
6 changes: 6 additions & 0 deletions runs/results/teracrawl_api_quality.json
@@ -0,0 +1,6 @@
{
  "success_rate": 0.842,
  "avg_recall": 0.625746455360982,
  "avg_precision": 0.6414422764926457,
  "avg_f1": 0.6270447297905766
}
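
Note that `avg_f1` (≈0.627) is lower than the F1 of the averaged precision and recall (≈0.633), which suggests F1 is computed per page and then averaged; that is an inference from the numbers, not something this diff states:

```python
# F1 of the averaged precision/recall, for comparison with avg_f1.
p, r = 0.6414422764926457, 0.625746455360982
print(2 * p * r / (p + r))  # ~0.6335, vs the reported avg_f1 of ~0.6270
```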