-
Notifications
You must be signed in to change notification settings - Fork 2k
/
web_base_loader.py
43 lines (34 loc) · 1.53 KB
/
web_base_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
from ..utils import get_relevant_images, extract_title
class WebBaseLoaderScraper:
    """Scrape a web page's text, relevant images and title.

    The page text is loaded through LangChain's ``WebBaseLoader``. The images
    and the title come from a second fetch of the same URL made through
    ``self.session`` and parsed with BeautifulSoup.
    """

    def __init__(self, link, session=None):
        """Store the target URL and the session used for the image/title fetch.

        Args:
            link: URL of the page to scrape.
            session: Object exposing a requests-style ``get`` method. When it
                is omitted (or falsy) a new ``requests.Session`` is created.
        """
        self.link = link
        self.session = session or requests.Session()

    def scrape(self) -> tuple:
        """Load the page text, then collect its relevant images and title.

        Returns:
            A 3-tuple ``(content, image_urls, title)``:

            * ``content`` -- the ``page_content`` of every document returned
              by ``WebBaseLoader.load()``, concatenated in order.
            * ``image_urls`` -- the result of ``get_relevant_images`` for the
              parsed page.
            * ``title`` -- the result of ``extract_title`` for the parsed page.

            Errors are never raised to the caller; they are printed instead.
            If loading the text fails, ``("", [], "")`` is returned. If only
            the image/title step fails, the text that was already loaded is
            kept and ``(content, [], "")`` is returned.
        """
        try:
            # Imported inside the try block so that a missing
            # langchain_community install is reported like any other scrape
            # failure instead of breaking the import of this module.
            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(self.link)
            # NOTE(security): this asks the loader's HTTP requests to skip TLS
            # certificate verification. It is kept for compatibility with
            # sites that have broken certificates, but the loaded text is
            # therefore not protected against man-in-the-middle tampering.
            loader.requests_kwargs = {"verify": False}
            content = "".join(doc.page_content for doc in loader.load())
        except Exception as e:
            print("Error! : " + str(e))
            return "", [], ""

        try:
            # Second fetch of the same URL: the loader only yields text, while
            # image and title extraction need the parsed HTML. This request
            # uses the session's own settings, so it can fail (for example on
            # a certificate error) even though the loader call succeeded.
            response = self.session.get(self.link)
            soup = BeautifulSoup(response.content, "html.parser")
            image_urls = get_relevant_images(soup, self.link)
            title = extract_title(soup)
        except Exception as e:
            # Images and title are best-effort extras: report the problem but
            # do not throw away the text that was already loaded.
            print("Error! : " + str(e))
            return content, [], ""

        return content, image_urls, title