 - markdownify: Convert any webpage into clean, formatted markdown
 - smartscraper: Extract structured data from any webpage using AI
 - searchscraper: Perform AI-powered web searches with structured results
-- crawl_requester: Initiate intelligent web crawling requests (step 1)
-- crawl_fetcher: Fetch results from crawling requests (step 2)
+- smartcrawler_initiate: Initiate intelligent multi-page web crawling with AI extraction or markdown conversion
+- smartcrawler_fetch_results: Retrieve results from asynchronous crawling operations
 """
 
 import os
@@ -126,49 +126,58 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr
 
         return response.json()
 
-    def crawl_requester(
+    def smartcrawler_initiate(
         self,
         url: str,
         prompt: str = None,
-        cache_website: bool = None,
+        extraction_mode: str = "ai",
         depth: int = None,
         max_pages: int = None,
-        same_domain_only: bool = None,
-        markdown_only: bool = None
+        same_domain_only: bool = None
     ) -> Dict[str, Any]:
         """
-        Initiate a web crawling request and get a request ID.
+        Initiate a SmartCrawler request for multi-page web crawling.
+
+        SmartCrawler supports two modes:
+        - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+        - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
+
+        SmartCrawler processes requests asynchronously and returns a request ID.
+        Pass that ID to smartcrawler_fetch_results and poll until the returned
+        status is "completed".
 
         Args:
             url: Starting URL to crawl
-            prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-            cache_website: Whether to cache the website content (optional)
-            depth: Maximum crawling depth (optional)
+            prompt: AI prompt for data extraction (required for AI mode)
+            extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+            depth: Maximum link traversal depth (optional)
             max_pages: Maximum number of pages to crawl (optional)
             same_domain_only: Whether to crawl only within the same domain (optional)
-            markdown_only: Whether to return only markdown content without AI processing (optional)
 
         Returns:
-            Dictionary containing the request ID and status
+            Dictionary containing the request ID for async processing
         """
-        endpoint = f"{self.BASE_URL}/requester"
+        endpoint = f"{self.BASE_URL}/crawl"
         data = {
             "url": url
         }
 
-        # Add optional parameters if provided
-        if prompt is not None:
+        # Handle extraction mode
+        if extraction_mode == "markdown":
+            data["markdown_only"] = True
+        elif extraction_mode == "ai":
+            if prompt is None:
+                raise ValueError("prompt is required when extraction_mode is 'ai'")
             data["prompt"] = prompt
-        if cache_website is not None:
-            data["cache_website"] = cache_website
+        else:
+            raise ValueError(f"Invalid extraction_mode: {extraction_mode}. Must be 'ai' or 'markdown'")
         if depth is not None:
             data["depth"] = depth
         if max_pages is not None:
             data["max_pages"] = max_pages
         if same_domain_only is not None:
             data["same_domain_only"] = same_domain_only
-        if markdown_only is not None:
-            data["markdown_only"] = markdown_only
 
         response = self.client.post(endpoint, headers=self.headers, json=data)
 
@@ -178,22 +187,27 @@ def crawl_requester(
 
         return response.json()
 
-    def crawl_fetcher(self, request_id: str) -> Dict[str, Any]:
+    def smartcrawler_fetch_results(self, request_id: str) -> Dict[str, Any]:
         """
-        Fetch the results of a crawling request using the request ID.
+        Fetch the results of a SmartCrawler operation.
 
         Args:
-            request_id: The request ID returned by crawl_requester
+            request_id: The request ID returned by smartcrawler_initiate
 
         Returns:
-            Dictionary containing the crawl results or status
+            Dictionary containing the crawled data (structured extraction or markdown)
+            and metadata about processed pages
+
+        Note:
+            Crawling runs asynchronously. While the request is still processing,
+            this call returns its current status; keep polling until the status
+            is "completed", at which point the results are included.
         """
-        endpoint = f"{self.BASE_URL}/fetcher"
-        data = {
-            "request_id": request_id
-        }
-
-        response = self.client.post(endpoint, headers=self.headers, json=data)
+        endpoint = f"{self.BASE_URL}/crawl/{request_id}"
+
+        response = self.client.get(endpoint, headers=self.headers)
 
         if response.status_code != 200:
             error_msg = f"Error {response.status_code}: {response.text}"
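The two methods above form an asynchronous pair: `smartcrawler_initiate` submits the crawl and returns a request ID, and `smartcrawler_fetch_results` must then be polled, as the docstrings note. Below is a minimal polling sketch against the client class in this diff; the `client` instance, the example URL and prompt, and the `request_id`/`status` response keys are assumptions based on the docstrings rather than confirmed API fields:

```python
import time

# `client` is assumed to be an instance of the API client class shown above,
# constructed with a valid ScrapeGraph API key.
job = client.smartcrawler_initiate(
    url="https://example.com",
    prompt="Extract product names and prices",
    extraction_mode="ai",
    max_pages=5,
)
request_id = job["request_id"]  # assumed response key, per the docstring

# Poll until the crawl finishes; the docstrings name "completed" as the
# terminal status.
while True:
    result = client.smartcrawler_fetch_results(request_id)
    if result.get("status") == "completed":
        break
    time.sleep(5)  # brief back-off between polls

print(result)
```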
@@ -291,66 +305,68 @@ def searchscraper(
         return {"error": str(e)}
 
 
-# Add tool for crawl requester (smartcrawler step 1)
+# Add tool for SmartCrawler initiation
 @mcp.tool()
-def crawl_requester(
+def smartcrawler_initiate(
     url: str,
     prompt: str = None,
-    cache_website: bool = None,
+    extraction_mode: str = "ai",
     depth: int = None,
     max_pages: int = None,
-    same_domain_only: bool = None,
-    markdown_only: bool = None
+    same_domain_only: bool = None
 ) -> Dict[str, Any]:
     """
-    Initiate a web crawling request and get a request ID.
+    Initiate a SmartCrawler request for intelligent multi-page web crawling.
+
+    SmartCrawler supports two modes:
+    - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+    - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
 
     Args:
         url: Starting URL to crawl
-        prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-        cache_website: Whether to cache the website content (optional)
-        depth: Maximum crawling depth (optional)
+        prompt: AI prompt for data extraction (required for AI mode)
+        extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+        depth: Maximum link traversal depth (optional)
         max_pages: Maximum number of pages to crawl (optional)
         same_domain_only: Whether to crawl only within the same domain (optional)
-        markdown_only: Whether to return only markdown content without AI processing (optional)
 
     Returns:
-        Dictionary containing the request ID and status
+        Dictionary containing the request ID for async processing
     """
     if scrapegraph_client is None:
         return {"error": "ScrapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.crawl_requester(
+        return scrapegraph_client.smartcrawler_initiate(
             url=url,
             prompt=prompt,
-            cache_website=cache_website,
+            extraction_mode=extraction_mode,
             depth=depth,
             max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            markdown_only=markdown_only
+            same_domain_only=same_domain_only
         )
     except Exception as e:
         return {"error": str(e)}
 
 
-# Add tool for crawl fetcher (smartcrawler step 2)
+# Add tool for fetching SmartCrawler results
 @mcp.tool()
-def crawl_fetcher(request_id: str) -> Dict[str, Any]:
+def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
     """
-    Fetch the results of a crawling request using the request ID.
+    Fetch the results of a SmartCrawler operation.
 
     Args:
-        request_id: The request ID returned by crawl_requester
+        request_id: The request ID returned by smartcrawler_initiate
 
     Returns:
-        Dictionary containing the crawl results or status
+        Dictionary containing the crawled data (structured extraction or markdown)
+        and metadata about processed pages
     """
     if scrapegraph_client is None:
         return {"error": "ScrapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.crawl_fetcher(request_id)
+        return scrapegraph_client.smartcrawler_fetch_results(request_id)
     except Exception as e:
         return {"error": str(e)}
 
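Because the MCP tools are plain decorated functions, the cheaper markdown mode can be exercised the same way. A short sketch, assuming the module is imported with `scrapegraph_client` initialized; the URL and crawl options are illustrative:

```python
# Markdown mode needs no prompt and costs 2 credits per page versus 10
# for AI extraction, per the docstrings above.
job = smartcrawler_initiate(
    url="https://example.com/docs",
    extraction_mode="markdown",
    depth=2,
    same_domain_only=True,
)

# Unsupported modes fail fast in the client before any API call; the tool
# wrapper converts the ValueError into an error payload:
# smartcrawler_initiate(url="https://example.com", extraction_mode="html")
# -> {"error": "Invalid extraction_mode: html. Must be 'ai' or 'markdown'"}
```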