- Sponsor
-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ability to transform proxied content #5
Comments
It would be fun to experiment with transforming streaming content - using |
Built this quick prototype with the help of ChatGPT: import asyncio
from html.parser import HTMLParser
import sys
class AddClassToPTagParser(HTMLParser):
    """Streaming HTML rewriter that adds the ``foo`` class to every ``<p>`` tag.

    Everything other than ``<p>`` start tags is passed through: other start
    tags are copied verbatim from the input, while end tags, text, entity and
    character references, comments, declarations and processing instructions
    are re-serialized from what the parser reports.

    ``feed()`` returns the HTML produced by that call, so the parser can be
    driven chunk by chunk; ``close()`` returns whatever was still buffered.
    """

    def __init__(self, *, convert_charrefs=False, **kwargs):
        """Create the parser.

        ``convert_charrefs`` defaults to ``False`` (unlike ``HTMLParser``) so
        that references such as ``&lt;`` reach ``handle_entityref`` /
        ``handle_charref`` and are re-emitted as written, instead of being
        decoded into ``handle_data`` and leaking out as a raw ``<``.
        """
        super().__init__(convert_charrefs=convert_charrefs, **kwargs)
        self.modified_html = ""

    def handle_starttag(self, tag, attrs):
        """Emit a start tag, adding ``foo`` to the class list of ``<p>``."""
        if tag != "p":
            # Untouched tags are copied exactly as they appeared in the input.
            self.modified_html += self.get_starttag_text()
            return

        # Local import: only this method needs it.
        from html import escape

        attrs = dict(attrs)
        existing = attrs.get("class")
        if not existing:
            # Missing, empty or valueless class attribute.
            attrs["class"] = "foo"
        elif "foo" not in existing.split():
            attrs["class"] = f"{existing} foo"

        # The parser hands over *unescaped* values (and None for valueless
        # attributes), so values must be re-escaped before being quoted.
        rendered = [
            name if value is None else f'{name}="{escape(value, quote=True)}"'
            for name, value in attrs.items()
        ]
        self.modified_html += f"<{tag} {' '.join(rendered)}>"

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag without inventing a separate end tag."""
        if tag == "p":
            self.handle_starttag(tag, attrs)
            self.handle_endtag(tag)
        else:
            # e.g. "<br/>" must stay "<br/>", not become "<br/></br>".
            self.modified_html += self.get_starttag_text()

    def handle_endtag(self, tag):
        """Emit an end tag."""
        self.modified_html += f"</{tag}>"

    def handle_data(self, data):
        """Emit text content unchanged."""
        self.modified_html += data

    def handle_entityref(self, name):
        """Re-emit a named reference such as ``&amp;``."""
        self.modified_html += f"&{name};"

    def handle_charref(self, name):
        """Re-emit a numeric reference such as ``&#62;`` or ``&#x3E;``."""
        self.modified_html += f"&#{name};"

    def handle_comment(self, data):
        """Re-emit a comment."""
        self.modified_html += f"<!--{data}-->"

    def handle_decl(self, decl):
        """Re-emit a declaration such as ``<!DOCTYPE html>``."""
        self.modified_html += f"<!{decl}>"

    def handle_pi(self, data):
        """Re-emit a processing instruction."""
        self.modified_html += f"<?{data}>"

    def feed(self, data):
        """Parse ``data`` and return the HTML generated by this call."""
        self.modified_html = ""
        super().feed(data)
        return self.modified_html

    def close(self):
        """Flush buffered input and return the HTML it generated."""
        self.modified_html = ""
        super().close()
        return self.modified_html
async def transform_html(async_generator):
    """Rewrite a stream of HTML chunks, adding class ``foo`` to ``<p>`` tags.

    Args:
        async_generator: async iterable yielding ``str`` chunks of HTML.

    Yields:
        The transformed HTML produced for each input chunk (possibly an empty
        string when a chunk ends mid-construct), followed by any output that
        was still buffered in the parser when the input ended.
    """
    parser = AddClassToPTagParser()
    async for chunk in async_generator:
        yield parser.feed(chunk)
    # feed() only emits what could be fully tokenized; incomplete trailing
    # input stays buffered until close() pushes it through the handlers.
    parser.modified_html = ""
    parser.close()
    if parser.modified_html:
        yield parser.modified_html
async def test():
    """Demo: stream a small HTML document through transform_html and print it.

    The document is deliberately split at awkward positions (inside a tag
    name, inside an attribute, inside text) to exercise chunked parsing.
    """

    async def html_generator():
        # Each entry is one network-style chunk. The text pieces are separate
        # chunks; they were previously fused into single literals containing
        # a stray '","', which then showed up in the printed output.
        chunks = [
            "<html><head>",
            "<title>Te",
            "st</title></he",
            "ad><body><p class=",
            '"bar">Hello, ',
            "world!</p>",
            "<p>",
            "Hello again, ",
            "world!</p></body></html>",
        ]
        for chunk in chunks:
            yield chunk
            # Pause so the incremental output is visible.
            await asyncio.sleep(0.5)

    async for transformed_chunk in transform_html(html_generator()):
        # Flush each piece immediately so it appears as soon as it is ready.
        print(transformed_chunk, end="", flush=True)
    print()
# Run the test coroutine
asyncio.run(test()) |
For import ijson
# Push-style parsing: parsed items are appended to `events` as data is sent.
events = ijson.sendable_list()
coro = ijson.items_coro(events, 'earth.europe.item')
f = urlopen('http://.../')
# iter(callable, sentinel) keeps calling f.read(buf_size) until it returns
# b'' (end of stream); without the sentinel, iter() raises TypeError because
# a functools.partial object is not iterable.
for chunk in iter(functools.partial(f.read, buf_size), b''):
    coro.send(chunk)
    process_accumulated_events(events)
    # Empty the list in place so the coroutine keeps appending to the same object.
    del events[:]
coro.close()
process_accumulated_events(events) |
import asyncio
import ijson
def remove_keys(obj):
    """Return a copy of ``obj`` without dict keys that begin with '_'.

    Dicts and lists are walked recursively; any other value is returned
    unchanged.
    """
    if isinstance(obj, list):
        return [remove_keys(element) for element in obj]
    if not isinstance(obj, dict):
        return obj

    cleaned = {}
    for key, value in obj.items():
        if key.startswith("_"):
            continue
        cleaned[key] = remove_keys(value)
    return cleaned
async def transform_json(async_generator):
    """Incrementally parse a JSON byte stream and yield cleaned items.

    Args:
        async_generator: async iterable yielding chunks of a JSON document.

    Yields:
        Each object ijson reports under the ``"item"`` prefix, passed through
        ``remove_keys`` so keys starting with ``_`` are dropped.
    """
    events = ijson.sendable_list()
    coro = ijson.items_coro(events, "item")
    async for chunk in async_generator:
        coro.send(chunk)
        # Drain everything completed by this chunk, then clear in place so
        # the coroutine keeps appending to the same list object.
        for item in events:
            yield remove_keys(item)
        del events[:]
    coro.close()
    # Closing the coroutine can flush further items; yield those as well
    # instead of silently dropping them.
    for item in events:
        yield remove_keys(item)
async def test():
    """Demo: stream a chunked JSON array through transform_json and print
    each transformed item as it is produced."""
    payload = (
        b'[{"item": {"_id": 1, "name": "test1"}},',
        b'{"item": {"_id": 2, "name": "test2"}},',
        b'{"item": {"_id": 3, "name": "test3"}}]',
    )

    async def json_stream():
        # Hand out one chunk at a time with a short pause between them.
        for piece in payload:
            yield piece
            await asyncio.sleep(0.1)

    cleaned_items = transform_json(json_stream())
    async for cleaned in cleaned_items:
        print(cleaned)
# Run the test coroutine
asyncio.run(test()) Note that this outputs:
Here is the ChatGPT transcript that got me here: https://chat.openai.com/share/3461da01-6e49-4324-9ece-cc2be1134f04 |
So I think the interface for this ends up looking something like this: # As seen above:
async def transform_html(async_generator):
    """Rewrite a stream of HTML chunks, adding class ``foo`` to ``<p>`` tags.

    Args:
        async_generator: async iterable yielding ``str`` chunks of HTML.

    Yields:
        The transformed HTML produced for each input chunk (possibly an empty
        string when a chunk ends mid-construct), followed by any output that
        was still buffered in the parser when the input ended.
    """
    parser = AddClassToPTagParser()
    async for chunk in async_generator:
        yield parser.feed(chunk)
    # feed() only emits what could be fully tokenized; incomplete trailing
    # input stays buffered until close() pushes it through the handlers.
    parser.modified_html = ""
    parser.close()
    if parser.modified_html:
        yield parser.modified_html
app = asgi_proxy("https://datasette.io", body_transformer=transform_html) This may be too simple though, since the transformer should probably take into account things like the content type of the response. Maybe something like this instead: def transform_factory(httpx_response):
# At this point we just have the headers
if 'text/html' in httpx_response.headers.get('content-type'):
return transform_html
app = asgi_proxy("https://datasette.io", body_transform_factory=transform_factory) |
Right now this proxy isn't useful for anything other than forwarding traffic to somewhere else.
Including patterns for transforming proxied content would be really useful.
Some transformations that I would want to support:
A challenge with content transformations is that they are harder to implement in a streaming fashion - often they'll need to accumulate the entire response body before being applied. Supporting a neat pattern for optionally doing that would be useful too.
The text was updated successfully, but these errors were encountered: