1010"""
1111
1212import os
13- from typing import Any , Dict
13+ import json
14+ from typing import Any , Dict , Optional , List , Union
1415
1516import httpx
1617from fastmcp import FastMCP
@@ -33,7 +34,7 @@ def __init__(self, api_key: str):
3334 "SGAI-APIKEY" : api_key ,
3435 "Content-Type" : "application/json"
3536 }
36- self .client = httpx .Client (timeout = 60.0 )
37+ self .client = httpx .Client (timeout = httpx . Timeout ( 120.0 ) )
3738
3839 def markdownify (self , website_url : str ) -> Dict [str , Any ]:
3940 """
@@ -126,6 +127,85 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr
 
         return response.json()
 
+    def scrape(self, website_url: str, render_heavy_js: Optional[bool] = None) -> Dict[str, Any]:
+        """
+        Basic scrape endpoint to fetch page content.
+
+        Args:
+            website_url: URL to scrape
+            render_heavy_js: Whether to render heavy JS (optional)
+
+        Returns:
+            Dictionary containing the scraped result
+        """
+        url = f"{self.BASE_URL}/scrape"
+        payload: Dict[str, Any] = {"website_url": website_url}
+        if render_heavy_js is not None:
+            payload["render_heavy_js"] = render_heavy_js
+
+        response = self.client.post(url, headers=self.headers, json=payload)
+        response.raise_for_status()
+        return response.json()
+
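
A hedged usage sketch for the new method. The client class name is not visible in this diff, so `ScapeGraphClient` below is an assumption; the constructor signature (`api_key`) comes from the `__init__` hunk above, and the environment variable name is likewise assumed:

```python
import os

# Class name and env var name assumed; only __init__(self, api_key: str)
# is visible in this diff.
client = ScapeGraphClient(api_key=os.environ["SGAI_API_KEY"])

# Plain fetch: render_heavy_js is None, so the payload carries only
# website_url and the API-side default applies.
page = client.scrape(website_url="https://example.com")

# JS-heavy page: the flag is added to the payload only when set.
page = client.scrape(website_url="https://example.com", render_heavy_js=True)
```
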
+    def sitemap(self, website_url: str) -> Dict[str, Any]:
+        """
+        Extract the sitemap for a given website.
+
+        Args:
+            website_url: Base website URL
+
+        Returns:
+            Dictionary containing sitemap URLs/structure
+        """
+        url = f"{self.BASE_URL}/sitemap"
+        payload: Dict[str, Any] = {"website_url": website_url}
+
+        response = self.client.post(url, headers=self.headers, json=payload)
+        response.raise_for_status()
+        return response.json()
+
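
Because both new methods call `raise_for_status()`, HTTP failures surface as exceptions rather than error payloads. A sketch of the error path, reusing the assumed `client` from the previous sketch:

```python
import httpx

try:
    sitemap_data = client.sitemap(website_url="https://example.com")
except httpx.HTTPStatusError as exc:
    # raise_for_status() turns 4xx/5xx responses into exceptions here;
    # the MCP tool wrappers later in this diff catch httpx.HTTPError
    # and convert them into {"error": ...} dictionaries instead.
    print(f"sitemap failed with HTTP {exc.response.status_code}")
```
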
+    def agentic_scrapper(
+        self,
+        url: str,
+        user_prompt: Optional[str] = None,
+        output_schema: Optional[Dict[str, Any]] = None,
+        steps: Optional[List[str]] = None,
+        ai_extraction: Optional[bool] = None,
+        persistent_session: Optional[bool] = None,
+        timeout_seconds: Optional[float] = None,
+    ) -> Dict[str, Any]:
+        """
+        Run the Agentic Scraper workflow (no live session/browser interaction).
+
+        Args:
+            url: Target website URL
+            user_prompt: Instructions for what to do/extract (optional)
+            output_schema: Desired structured output schema (optional)
+            steps: High-level steps/instructions for the agent (optional)
+            ai_extraction: Whether to enable AI extraction mode (optional)
+            persistent_session: Whether to keep the session alive between steps (optional)
+            timeout_seconds: Per-request timeout override in seconds (optional)
+        """
+        endpoint = f"{self.BASE_URL}/agentic-scrapper"
+        payload: Dict[str, Any] = {"url": url}
+        if user_prompt is not None:
+            payload["user_prompt"] = user_prompt
+        if output_schema is not None:
+            payload["output_schema"] = output_schema
+        if steps is not None:
+            payload["steps"] = steps
+        if ai_extraction is not None:
+            payload["ai_extraction"] = ai_extraction
+        if persistent_session is not None:
+            payload["persistent_session"] = persistent_session
+
+        if timeout_seconds is not None:
+            response = self.client.post(endpoint, headers=self.headers, json=payload, timeout=timeout_seconds)
+        else:
+            response = self.client.post(endpoint, headers=self.headers, json=payload)
+        response.raise_for_status()
+        return response.json()
+
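
Since only non-None arguments reach the payload, callers pass just what they need. A hypothetical invocation, continuing the client sketch above:

```python
# Hypothetical invocation; the URL, prompt, and steps are illustrative.
result = client.agentic_scrapper(
    url="https://example.com/login",
    user_prompt="Log in and list the account's recent orders",
    steps=["open the login form", "submit credentials", "read the orders table"],
    timeout_seconds=300.0,  # per-call override of the client-wide 120 s timeout
)
# Payload sent: url, user_prompt, and steps only; output_schema,
# ai_extraction, and persistent_session are omitted, and the timeout
# override applies to this POST alone.
```
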
     def smartcrawler_initiate(
         self,
         url: str,
@@ -371,6 +451,110 @@ def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
         return {"error": str(e)}
 
 
+# Add tool for basic scrape
+@mcp.tool()
+def scrape(website_url: str, render_heavy_js: Optional[bool] = None) -> Dict[str, Any]:
+    """
+    Fetch page content for a URL.
+
+    Args:
+        website_url: URL to scrape
+        render_heavy_js: Whether to render heavy JS (optional)
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.scrape(website_url=website_url, render_heavy_js=render_heavy_js)
+    except httpx.HTTPError as http_err:
+        return {"error": str(http_err)}
+    except ValueError as val_err:
+        return {"error": str(val_err)}
+
+
+# Add tool for sitemap extraction
+@mcp.tool()
+def sitemap(website_url: str) -> Dict[str, Any]:
+    """
+    Extract the sitemap for a website.
+
+    Args:
+        website_url: Base website URL
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.sitemap(website_url=website_url)
+    except httpx.HTTPError as http_err:
+        return {"error": str(http_err)}
+    except ValueError as val_err:
+        return {"error": str(val_err)}
+
+
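
A sketch of exercising the two new tools over FastMCP's in-memory transport, assuming FastMCP 2.x where `Client(server)` connects directly to a server instance; `mcp` is the FastMCP server defined in this module, and the import path below is a guess for illustration:

```python
import asyncio

from fastmcp import Client

from scrapegraph_mcp.server import mcp  # hypothetical import path


async def demo() -> None:
    # In-memory transport: no subprocess or network needed for testing.
    async with Client(mcp) as client:
        page = await client.call_tool("scrape", {"website_url": "https://example.com"})
        urls = await client.call_tool("sitemap", {"website_url": "https://example.com"})
        print(page, urls)


asyncio.run(demo())
```
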
+# Add tool for Agentic Scraper (no live session/browser interaction)
+@mcp.tool()
+def agentic_scrapper(
+    url: str,
+    user_prompt: Optional[str] = None,
+    output_schema: Optional[Union[str, Dict[str, Any]]] = None,
+    steps: Optional[Union[str, List[str]]] = None,
+    ai_extraction: Optional[bool] = None,
+    persistent_session: Optional[bool] = None,
+    timeout_seconds: Optional[float] = None,
+) -> Dict[str, Any]:
+    """
+    Run the Agentic Scraper workflow. Accepts flexible input forms for steps and schema.
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    # Normalize inputs
+    normalized_steps: Optional[List[str]] = None
+    if isinstance(steps, list):
+        normalized_steps = steps
+    elif isinstance(steps, str):
+        parsed_steps: Optional[Any] = None
+        try:
+            parsed_steps = json.loads(steps)
+        except json.JSONDecodeError:
+            parsed_steps = None
+        if isinstance(parsed_steps, list):
+            normalized_steps = parsed_steps
+        else:
+            normalized_steps = [steps]
+
+    normalized_schema: Optional[Dict[str, Any]] = None
+    if isinstance(output_schema, dict):
+        normalized_schema = output_schema
+    elif isinstance(output_schema, str):
+        try:
+            parsed_schema = json.loads(output_schema)
+            if isinstance(parsed_schema, dict):
+                normalized_schema = parsed_schema
+            else:
+                return {"error": "output_schema must be a JSON object"}
+        except json.JSONDecodeError as e:
+            return {"error": f"Invalid JSON for output_schema: {str(e)}"}
+
+    try:
+        return scrapegraph_client.agentic_scrapper(
+            url=url,
+            user_prompt=user_prompt,
+            output_schema=normalized_schema,
+            steps=normalized_steps,
+            ai_extraction=ai_extraction,
+            persistent_session=persistent_session,
+            timeout_seconds=timeout_seconds,
+        )
+    except httpx.TimeoutException as timeout_err:
+        return {"error": f"Request timed out: {str(timeout_err)}"}
+    except httpx.HTTPError as http_err:
+        return {"error": str(http_err)}
+    except ValueError as val_err:
+        return {"error": str(val_err)}
+
+
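
The tool's normalization accepts three shapes for `steps`: a real list, a JSON-encoded list, or a bare string. A self-contained restatement of that logic, useful for seeing what each input becomes before it reaches the client:

```python
import json
from typing import Any, List, Optional


def normalize_steps(steps: Any) -> Optional[List[str]]:
    """Mirror of the tool's steps normalization above."""
    if isinstance(steps, list):
        return steps
    if isinstance(steps, str):
        try:
            parsed = json.loads(steps)
        except json.JSONDecodeError:
            parsed = None
        # JSON arrays pass through; anything else wraps the raw string.
        return parsed if isinstance(parsed, list) else [steps]
    return None


assert normalize_steps(["a", "b"]) == ["a", "b"]            # list: unchanged
assert normalize_steps('["a", "b"]') == ["a", "b"]          # JSON string: parsed
assert normalize_steps("click submit") == ["click submit"]  # plain string: wrapped
```
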
 def main() -> None:
     """Run the ScapeGraph MCP server."""
     print("Starting ScapeGraph MCP server!")