From 03700116261110177f15bddccbbb02401f1c5a4e Mon Sep 17 00:00:00 2001
From: matt cameron
Date: Sun, 10 Nov 2024 19:02:40 +1100
Subject: [PATCH] feat: add Beautiful Soup plugin #884

---
 .../action/v1/beautifulsoup_action.py         | 127 ++++++++++++++++++
 tracardi/service/setup/setup_plugins.py       |  12 +-
 2 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 tracardi/process_engine/action/v1/beautifulsoup_action.py

diff --git a/tracardi/process_engine/action/v1/beautifulsoup_action.py b/tracardi/process_engine/action/v1/beautifulsoup_action.py
new file mode 100644
index 00000000..386fefec
--- /dev/null
+++ b/tracardi/process_engine/action/v1/beautifulsoup_action.py
@@ -0,0 +1,127 @@
+from lxml import etree
+from tracardi.service.plugin.domain.register import Plugin, Spec, MetaData, Documentation, PortDoc, Form, FormGroup, \
+    FormField, FormComponent
+from tracardi.service.plugin.domain.result import Result
+from tracardi.service.plugin.domain.config import PluginConfig
+from tracardi.service.plugin.runner import ActionRunner
+from pydantic import field_validator
+from tracardi.domain.profile import Profile
+from tracardi.service.tracardi_http_client import HttpClient
+
+from bs4 import BeautifulSoup
+
+class Configuration(PluginConfig):
+    """Settings for the BeautifulSoup action."""
+
+    # Dot-path reference; run() resolves it through the dot accessor to obtain the HTML.
+    html: str
+    # Name of the BeautifulSoup method to apply; run() only handles "get_text".
+    method: str = "get_text"
+
+    @field_validator('html')
+    @classmethod
+    def html_must_not_be_empty(cls, value):
+        """Reject an empty or whitespace-only `html` value."""
+        if value.strip() == "":
+            raise ValueError("HTML must not be empty.")
+        return value
+
+def validate(config: dict):
+    """Build a validated Configuration from a raw configuration dict."""
+    return Configuration(**config)
+
+class BeautifulSoupAction(ActionRunner):
+    """Action that parses HTML with BeautifulSoup and outputs its text content."""
+
+    config: Configuration
+
+    async def set_up(self, init):
+        """Validate the init dict and keep the resulting Configuration."""
+        self.config = validate(init)
+
+    async def run(self, payload: dict, in_edge=None) -> Result:
+        """Extract text from the HTML referenced by `config.html`.
+
+        Returns a Result on the `result` port carrying `{'text': ...}`, or on
+        the `error` port carrying `{'message': ...}` when anything fails.
+        """
+
+        dot = self._get_dot_accessor(payload)
+
+        try:
+
+            html = dot[self.config.html]
+            soup = BeautifulSoup(html, 'html.parser')
+
+            if self.config.method == "get_text":
+                text = soup.get_text()
+            else:
+                raise ValueError(f"Unsupported method: {self.config.method}")
+
+            return Result(port='result', value={'text': text})
+
+        except Exception as e:
+            # Plugin boundary: report any failure on the error port instead of raising.
+            return Result(value={"message": str(e)}, port="error")
+
+def register() -> Plugin:
+    """Return the plugin registration: spec, configuration form and metadata."""
+    return Plugin(
+        start=False,
+        spec=Spec(
+            module=__name__,
+            className=BeautifulSoupAction.__name__,
+            inputs=["payload"],
+            outputs=["result", "error"],
+            version="1.0.4",
+            init={
+                "html": "",
+                "method": "get_text",
+            },
+            form=Form(groups=[
+                FormGroup(
+                    name="Beautiful Soup configuration",
+                    fields=[
+                        FormField(
+                            id="html",
+                            name="HTML",
+                            description="The HTML to be converted.",
+                            component=FormComponent(type="dotPath", props={
+                                "label": "HTML"
+                            }),
+                        ),
+                        FormField(
+                            id="method",
+                            name="Method",
+                            description="The BeautifulSoup method to apply to the HTML",
+                            component=FormComponent(type="select", props={
+                                "label": "Method",
+                                "items": {
+                                    "get_text": "get_text",
+                                }
+                            })
+                        ),
+                    ]),
+            ]),
+            license="MIT",
+            author="Matt Cameron",
+            manual="beautifulsoup",
+
+        ),
+        metadata=MetaData(
+            name='BeautifulSoup',
+            desc='Converts HTML to text.',
+            icon='BeautifulSoup',
+            group=['Data Processing'],
+            documentation=Documentation(
+                inputs={
+                    "payload": PortDoc(desc="This port takes payload object.")
+                },
+                outputs={
+                    "result": PortDoc(desc="Returns the text extracted from the HTML."),
+                    "error": PortDoc(desc="Returns error message if plugin fails.")
+                }
+            )
+        )
+    )
+
diff --git a/tracardi/service/setup/setup_plugins.py b/tracardi/service/setup/setup_plugins.py
index 7fa60548..307dcd1f 100644
--- a/tracardi/service/setup/setup_plugins.py
+++ b/tracardi/service/setup/setup_plugins.py
@@ -801,7 +801,17 @@
             },
             resource=None),
     ),
-
+
+    "tracardi.process_engine.action.v1.beautifulsoup_action": PluginMetadata(
+        test=PluginTest(
+            init={
+                'html': 'payload@html',
+                'method': 'get_text',
+
+            },
+            resource=None),
+    ),
+
 }
 
 if License.has_service(SCHEDULER):