Skip to content

Commit

Permalink
Merge pull request #843 from garylin2099/ci_dev
Browse files Browse the repository at this point in the history
update webscraping tool
  • Loading branch information
garylin2099 authored Feb 5, 2024
2 parents 23c2762 + 9b72370 commit 4dd4695
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 6 deletions.
2 changes: 1 addition & 1 deletion examples/crawl_webpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

async def main():
prompt = """Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key data*"""
and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*"""
ci = CodeInterpreter(goal=prompt, use_tools=True)

await ci.run(prompt)
Expand Down
9 changes: 4 additions & 5 deletions metagpt/tools/libs/web_scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,18 @@


@register_tool(tool_type=ToolType.WEBSCRAPING.type_name)
async def scrape_web_playwright(url, *urls):
async def scrape_web_playwright(url):
"""
Scrape and save the HTML structure and inner text content of a web page using Playwright.
Asynchronously scrape and save the HTML structure and inner text content of a web page using Playwright.
Args:
url (str): The main URL to fetch inner text from.
*urls (str): Additional URLs to fetch inner text from.
Returns:
(dict): The inner text content and html structure of the web page, key are : 'inner_text', 'html'.
dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.
"""
# Create a PlaywrightWrapper instance for the Chromium browser
web = await PlaywrightWrapper().run(url, *urls)
web = await PlaywrightWrapper().run(url)

# Return the inner text content and HTML structure of the web page
return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}

0 comments on commit 4dd4695

Please sign in to comment.