Skip to content

Commit 3490084

Browse files
authored
feat: add Beautiful Soup plugin (#885)
#884
1 parent cf32a9e commit 3490084

File tree

2 files changed

+124
-1
lines changed

2 files changed

+124
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from lxml import etree
2+
from tracardi.service.plugin.domain.register import Plugin, Spec, MetaData, Documentation, PortDoc, Form, FormGroup, \
3+
FormField, FormComponent
4+
from tracardi.service.plugin.domain.result import Result
5+
from tracardi.service.plugin.domain.config import PluginConfig
6+
from tracardi.service.plugin.runner import ActionRunner
7+
from pydantic import field_validator
8+
from tracardi.domain.profile import Profile
9+
from tracardi.service.tracardi_http_client import HttpClient
10+
11+
from bs4 import BeautifulSoup
12+
13+
class Configuration(PluginConfig):
    """Settings accepted by the Beautiful Soup action.

    ``html`` is the key the action uses to look the markup up through the
    dot accessor; ``method`` names the Beautiful Soup operation to apply and
    defaults to ``"get_text"``.
    """

    html: str
    method: str = "get_text"

    @field_validator('html')
    @classmethod
    def html_must_not_be_empty(cls, value):
        """Return ``value`` unchanged unless it is empty or whitespace only.

        Raises:
            ValueError: if ``value`` contains nothing but whitespace.
        """
        if value.strip() != "":
            return value
        raise ValueError("HTML must not be empty.")
23+
24+
def validate(config: dict):
    """Build a ``Configuration`` from a raw init dictionary.

    Any validation error raised while constructing ``Configuration``
    propagates to the caller.
    """
    settings = Configuration(**config)
    return settings
26+
27+
class BeautifulSoupAction(ActionRunner):
    """Action that extracts text from HTML with Beautiful Soup."""

    config: Configuration

    async def set_up(self, init):
        """Validate the init dictionary and store the resulting configuration."""
        self.config = validate(init)

    async def run(self, payload: dict, in_edge=None) -> Result:
        """Resolve the configured HTML, apply the configured method, and emit the text.

        Returns a ``Result`` on the ``result`` port holding ``{'text': ...}``.
        Any exception raised while resolving, parsing or dispatching is
        converted into a ``Result`` on the ``error`` port carrying its message.
        """
        accessor = self._get_dot_accessor(payload)

        try:
            markup = accessor[self.config.html]
            # Parsing happens before the method check, so a parsing failure is
            # reported in preference to an unsupported-method failure.
            parsed = BeautifulSoup(markup, 'html.parser')

            if self.config.method != "get_text":
                raise ValueError(f"Unsupported method: {self.config.method}")

            extracted = parsed.get_text()
            return Result(port='result', value={'text': extracted})

        except Exception as e:
            return Result(value={"message": str(e)}, port="error")
54+
55+
def register() -> Plugin:
    """Describe the Beautiful Soup action for plugin registration.

    Returns a ``Plugin`` whose spec points at ``BeautifulSoupAction`` in this
    module, declares one input port (``payload``) and two output ports
    (``result`` and ``error``), and provides the default init values plus the
    configuration form with the ``html`` and ``method`` fields.
    """
    return Plugin(
        start=False,
        spec=Spec(
            module=__name__,
            className=BeautifulSoupAction.__name__,
            inputs=["payload"],
            outputs=["result", "error"],
            version="1.0.4",
            init={
                "html": "",
                # Mirrors Configuration's default so the init dict carries a
                # value for every field the form exposes.
                # NOTE(review): presumably the form is pre-filled from init - confirm.
                "method": "get_text",
            },
            form=Form(groups=[
                FormGroup(
                    name="Beautiful Soup configuration",
                    fields=[
                        FormField(
                            id="html",
                            name="HTML",
                            description="The HTML to be converted.",
                            component=FormComponent(type="dotPath", props={
                                "label": "HTML"
                            }),
                        ),
                        FormField(
                            id="method",
                            name="Method",
                            description="The BeautifulSoup method to apply to the HTML",
                            component=FormComponent(type="select", props={
                                "label": "Method",
                                "items": {
                                    "get_text": "get_text",
                                }
                            })
                        ),
                    ]),
            ]),
            license="MIT",
            author="Matt Cameron",
            manual="beautifulsoup",
        ),
        metadata=MetaData(
            name='BeautifulSoup',
            desc='Converts HTML to text.',
            icon='BeautifulSoup',
            group=['Data Processing'],
            documentation=Documentation(
                inputs={
                    "payload": PortDoc(desc="This port takes payload object.")
                },
                outputs={
                    # The previous text mentioned a "Sitemap service", which
                    # this action never calls.
                    "result": PortDoc(desc="Returns the text extracted from the HTML."),
                    "error": PortDoc(desc="Returns error message if plugin fails.")
                }
            )
        )
    )
113+

tracardi/service/setup/setup_plugins.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -801,7 +801,17 @@
801801
},
802802
resource=None),
803803
),
804-
804+
805+
"tracardi.process_engine.action.v1.beautifulsoup_action": PluginMetadata(
806+
test=PluginTest(
807+
init={
808+
'HTML': '',
809+
'method': 'get_text',
810+
811+
},
812+
resource=None),
813+
),
814+
805815
}
806816

807817
if License.has_service(SCHEDULER):

0 commit comments

Comments
 (0)