Add example of adding streaming support to run llm with transformers (#68)

* Add example of adding streaming support to run llm with transformers

* lint
bddppq authored Mar 29, 2024
1 parent 2f3443e commit da6ee36
Showing 3 changed files with 73 additions and 7 deletions.
6 changes: 4 additions & 2 deletions advanced/earning-sage/main.py
@@ -61,9 +61,11 @@ def ui(self):
 
         with blocks:
             gr.Markdown("# 🧙🏼 Earning Report Assistant")
-            gr.Markdown("""
+            gr.Markdown(
+                """
 This is an earning report assistant built for investors who can't make the earning call on time. This sample uses the Apple 2023 Q2 report. Feel free to reach out to uz@lepton.ai for more advanced features.
-            """)
+                """
+            )
         with gr.Row():
             chatbot = gr.Chatbot(label="Model")
         with gr.Row():
62 changes: 62 additions & 0 deletions advanced/hf-stream-llm/photon.py
@@ -0,0 +1,62 @@
import os
from threading import Thread
from queue import Queue

from loguru import logger
from leptonai.photon import Photon, StreamingResponse


class HfStreamLLM(Photon):

    deployment_template = {
        "resource_shape": "gpu.a10.6xlarge",
        "env": {
            "MODEL_PATH": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        },
        "secret": [
            "HUGGING_FACE_HUB_TOKEN",
        ],
    }

    requirement_dependency = [
        "transformers",
    ]

    # Maximum number of handler calls served concurrently; also the number
    # of generation worker threads started in init().
    handler_max_concurrency = 4

    def init(self):
        from transformers import AutoModelForCausalLM, AutoTokenizer

        model_path = os.environ["MODEL_PATH"]

        self._tok = AutoTokenizer.from_pretrained(model_path)
        self._model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

        # Pending generation requests; each worker thread blocks on this
        # queue so model.generate() never runs on the serving thread.
        self._generation_queue = Queue()

        for _ in range(self.handler_max_concurrency):
            Thread(target=self._generate, daemon=True).start()

    def _generate(self):
        while True:
            streamer, args, kwargs = self._generation_queue.get()
            try:
                self._model.generate(*args, **kwargs)
            except Exception as e:
                logger.error(f"Error in generation: {e}")
                # Push the stop signal so the consumer's iterator terminates
                # instead of hanging until the timeout.
                streamer.text_queue.put(streamer.stop_signal)

    @Photon.handler
    def run(self, text: str, max_new_tokens: int = 100) -> StreamingResponse:
        from transformers import TextIteratorStreamer

        # TextIteratorStreamer is an iterator that yields decoded text chunks
        # as generate() produces tokens on a worker thread.
        streamer = TextIteratorStreamer(self._tok, skip_prompt=True, timeout=60)
        inputs = self._tok(text, return_tensors="pt").to("cuda")
        self._generation_queue.put_nowait(
            (
                streamer,
                (),
                dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens),
            )
        )
        return streamer
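
For readers who want to try the new photon, below is a minimal client sketch for consuming the streaming endpoint. It assumes the photon is already running and serving locally at http://localhost:8080; the URL, port, and prompt are illustrative assumptions, not part of this commit.

import requests

# Request a completion and read the body incrementally; stream=True keeps
# the HTTP connection open while tokens are still being generated.
resp = requests.post(
    "http://localhost:8080/run",
    json={"text": "Tell me about the moon.", "max_new_tokens": 100},
    stream=True,
)
resp.raise_for_status()
resp.encoding = resp.encoding or "utf-8"  # assume UTF-8 if the server does not say
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    # Each chunk is a piece of generated text; print it as soon as it arrives.
    print(chunk, end="", flush=True)

Each chunk arrives as soon as a worker thread's generate() call emits it, which is exactly what the TextIteratorStreamer-plus-queue setup above provides.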
12 changes: 7 additions & 5 deletions advanced/segment-anything/sam.py
@@ -229,11 +229,13 @@ def generate_mask(self, url: str) -> PNGResponse:
         # The below rendering code is copied from the segment-anything repo to draw the mask
         # on top of the original image.
         sorted_anns = sorted(masks, key=(lambda x: x["area"]), reverse=True)
-        mask_img = np.ones((
-            sorted_anns[0]["segmentation"].shape[0],
-            sorted_anns[0]["segmentation"].shape[1],
-            3,
-        ))
+        mask_img = np.ones(
+            (
+                sorted_anns[0]["segmentation"].shape[0],
+                sorted_anns[0]["segmentation"].shape[1],
+                3,
+            )
+        )
         for ann in sorted_anns:
             mask_img[ann["segmentation"]] = np.random.random(3)
         alpha = 0.35
