Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add format option with raw HTML to URL component #3762

Merged
merged 2 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions src/backend/base/langflow/components/data/URL.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import re

from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader

from langflow.helpers.data import data_to_text
from langflow.custom import Component
from langflow.io import MessageTextInput, Output
from langflow.io import DropdownInput, MessageTextInput, Output
from langflow.schema import Data
from langflow.schema.message import Message

Expand All @@ -22,6 +22,13 @@ class URLComponent(Component):
info="Enter one or more URLs, by clicking the '+' button.",
is_list=True,
),
DropdownInput(
name="format",
display_name="Output format",
info="Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
options=["Text", "Raw HTML"],
value="Text",
),
]

outputs = [
Expand Down Expand Up @@ -64,7 +71,10 @@ def ensure_url(self, string: str) -> str:

def fetch_content(self) -> list[Data]:
urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]
loader = WebBaseLoader(web_paths=urls, encoding="utf-8")
if self.format == "Raw HTML":
loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
else:
loader = WebBaseLoader(web_paths=urls, encoding="utf-8")
docs = loader.load()
data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]
self.status = data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,27 @@
"show": true,
"title_case": false,
"type": "code",
"value": "import re\n\nfrom langchain_community.document_loaders.web_base import WebBaseLoader\n\nfrom langflow.helpers.data import data_to_text\nfrom langflow.custom import Component\nfrom langflow.io import MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Fetch content from one or more URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n info=\"Enter one or more URLs, by clicking the '+' button.\",\n is_list=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"\n Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n raise ValueError(f\"Invalid URL: {string}\")\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n"
"value": "import re\n\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.helpers.data import data_to_text\nfrom langflow.custom import Component\nfrom langflow.io import DropdownInput, MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Fetch content from one or more URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n info=\"Enter one or more URLs, by clicking the '+' button.\",\n is_list=True,\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output format\",\n info=\"Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.\",\n options=[\"Text\", \"Raw HTML\"],\n value=\"Text\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"\n Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n raise ValueError(f\"Invalid URL: {string}\")\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n"
},
"format": {
"_input_type": "DropdownInput",
"advanced": false,
"combobox": false,
"display_name": "Output format",
"dynamic": false,
"info": "Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
"name": "format",
"options": [
"Text",
"Raw HTML"
],
"placeholder": "",
"required": false,
"show": true,
"title_case": false,
"trace_as_metadata": true,
"type": "str",
"value": "Text"
},
"urls": {
"advanced": false,
Expand Down
Loading