Add example of adding streaming support to run llm with transformers (#68)

* Add example of adding streaming support to run llm with transformers

* lint
bddppq authored Mar 29, 2024
1 parent 2f3443e commit da6ee36
Showing 3 changed files with 73 additions and 7 deletions.
6 changes: 4 additions & 2 deletions advanced/earning-sage/main.py
@@ -61,9 +61,11 @@ def ui(self):
 
         with blocks:
             gr.Markdown("# 🧙🏼 Earning Report Assistant")
-            gr.Markdown("""
+            gr.Markdown(
+                """
 This is an earning report assistant built for investors who can't make the earning call on time. This sample uses the Apple 2023 Q2 report. Feel free to reach out to uz@lepton.ai for more advanced features.
-            """)
+                """
+            )
         with gr.Row():
             chatbot = gr.Chatbot(label="Model")
         with gr.Row():
62 changes: 62 additions & 0 deletions advanced/hf-stream-llm/photon.py
@@ -0,0 +1,62 @@
import os
from threading import Thread
from queue import Queue

from loguru import logger
from leptonai.photon import Photon, StreamingResponse


class HfStreamLLM(Photon):

    deployment_template = {
        "resource_shape": "gpu.a10.6xlarge",
        "env": {
            "MODEL_PATH": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        },
        "secret": [
            "HUGGING_FACE_HUB_TOKEN",
        ],
    }

    requirement_dependency = [
        "transformers",
    ]

    # Maximum number of handler calls served concurrently; also the number
    # of generation worker threads started in init().
    handler_max_concurrency = 4

    def init(self):
        from transformers import AutoModelForCausalLM, AutoTokenizer

        model_path = os.environ["MODEL_PATH"]

        self._tok = AutoTokenizer.from_pretrained(model_path)
        self._model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

        # Pending generation requests; each worker thread blocks on this
        # queue so model.generate() never runs on the serving thread.
        self._generation_queue = Queue()

        for _ in range(self.handler_max_concurrency):
            Thread(target=self._generate, daemon=True).start()

    def _generate(self):
        while True:
            streamer, args, kwargs = self._generation_queue.get()
            try:
                self._model.generate(*args, **kwargs)
            except Exception as e:
                logger.error(f"Error in generation: {e}")
                # Push the stop signal so the consumer's iterator terminates
                # instead of hanging until the timeout.
                streamer.text_queue.put(streamer.stop_signal)

    @Photon.handler
    def run(self, text: str, max_new_tokens: int = 100) -> StreamingResponse:
        from transformers import TextIteratorStreamer

        # TextIteratorStreamer is an iterator that yields decoded text chunks
        # as generate() produces tokens on a worker thread.
        streamer = TextIteratorStreamer(self._tok, skip_prompt=True, timeout=60)
        inputs = self._tok(text, return_tensors="pt").to("cuda")
        self._generation_queue.put_nowait(
            (
                streamer,
                (),
                dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens),
            )
        )
        return streamer
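
For readers who want to try the new photon, below is a minimal client sketch for consuming the streaming endpoint. It assumes the photon is already running and serving locally at http://localhost:8080; the URL, port, and prompt are illustrative assumptions, not part of this commit.

import requests

# Request a completion and read the body incrementally; stream=True keeps
# the HTTP connection open while tokens are still being generated.
resp = requests.post(
    "http://localhost:8080/run",
    json={"text": "Tell me about the moon.", "max_new_tokens": 100},
    stream=True,
)
resp.raise_for_status()
resp.encoding = resp.encoding or "utf-8"  # assume UTF-8 if the server does not say
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    # Each chunk is a piece of generated text; print it as soon as it arrives.
    print(chunk, end="", flush=True)

Each chunk arrives as soon as a worker thread's generate() call emits it, which is exactly what the TextIteratorStreamer-plus-queue setup above provides.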
12 changes: 7 additions & 5 deletions advanced/segment-anything/sam.py
@@ -229,11 +229,13 @@ def generate_mask(self, url: str) -> PNGResponse:
         # The below rendering code is copied from the segment-anything repo to draw the mask
         # on top of the original image.
         sorted_anns = sorted(masks, key=(lambda x: x["area"]), reverse=True)
-        mask_img = np.ones((
-            sorted_anns[0]["segmentation"].shape[0],
-            sorted_anns[0]["segmentation"].shape[1],
-            3,
-        ))
+        mask_img = np.ones(
+            (
+                sorted_anns[0]["segmentation"].shape[0],
+                sorted_anns[0]["segmentation"].shape[1],
+                3,
+            )
+        )
         for ann in sorted_anns:
             mask_img[ann["segmentation"]] = np.random.random(3)
         alpha = 0.35
