docs: Update the Tabby Deployment with Modal documentation #2593

Open · wants to merge 16 commits into main

133 changes: 133 additions & 0 deletions website/blog/2024-07-10-enhanced-tabby-deployment-on-modal/app.py
```python
"""Usage:
modal serve app.py

To force a rebuild by pulling the latest image tag, use:
MODAL_FORCE_BUILD=1 modal serve app.py
"""

import os

from modal import Image, App, asgi_app, gpu, Volume

IMAGE_NAME = "tabbyml/tabby"
MODEL_ID = "TabbyML/StarCoder-1B"
CHAT_MODEL_ID = "TabbyML/Qwen2-1.5B-Instruct"
EMBEDDING_MODEL_ID = "TabbyML/Nomic-Embed-Text"
GPU_CONFIG = gpu.T4()

TABBY_BIN = "/opt/tabby/bin/tabby"

# Route model downloads to /models so the weights are baked into the image layer.
TABBY_ENV = os.environ.copy()
TABBY_ENV["TABBY_MODEL_CACHE_ROOT"] = "/models"


def download_model():
    import subprocess

    subprocess.run(
        [
            TABBY_BIN,
            "download",
            "--model",
            MODEL_ID,
        ],
        env=TABBY_ENV,
    )


def download_chat_model():
    import subprocess

    subprocess.run(
        [
            TABBY_BIN,
            "download",
            "--model",
            CHAT_MODEL_ID,
        ],
        env=TABBY_ENV,
    )


def download_embedding_model():
    import subprocess

    subprocess.run(
        [
            TABBY_BIN,
            "download",
            "--model",
            EMBEDDING_MODEL_ID,
        ],
        env=TABBY_ENV,
    )


# Each run_function step is cached as an image layer, so the models are
# downloaded once at build time rather than on every container start.
image = (
    Image.from_registry(
        IMAGE_NAME,
        add_python="3.11",
    )
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .run_function(download_chat_model)
    .run_function(download_embedding_model)
    .pip_install("asgi-proxy-lib")
)

app = App("tabby-server", image=image)

# Persistent volume for user data and configuration, mounted at /data.
data_volume = Volume.from_name("tabby-data", create_if_missing=True)
data_dir = "/data"


@app.function(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,
    container_idle_timeout=120,
    timeout=360,
    volumes={data_dir: data_volume},
    _allow_background_volume_commits=True,
    concurrency_limit=1,
)
@asgi_app()
def app_serve():
    import socket
    import subprocess
    import time

    from asgi_proxy import asgi_proxy

    launcher = subprocess.Popen(
        [
            TABBY_BIN,
            "serve",
            "--model",
            MODEL_ID,
            "--chat-model",
            CHAT_MODEL_ID,
            "--port",
            "8000",
            "--device",
            "cuda",
            "--parallelism",
            "1",
        ],
        env=TABBY_ENV,
    )

    # Poll until the webserver at 127.0.0.1:8000 accepts connections before serving inputs.
    def tabby_ready():
        try:
            socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
            return True
        except (socket.timeout, ConnectionRefusedError):
            # If the launcher process has exited, a connection can never be made.
            retcode = launcher.poll()
            if retcode is not None:
                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
            return False

    while not tabby_ready():
        time.sleep(1.0)

    print("Tabby server ready!")
    return asgi_proxy("http://localhost:8000")
```
127 changes: 127 additions & 0 deletions website/blog/2024-07-10-enhanced-tabby-deployment-on-modal/index.md
---
title: Enhanced Tabby Deployment on Modal
authors:
  - name: moqimoqidea
    url: https://github.com/moqimoqidea
    image_url: https://github.com/moqimoqidea
tags: [deployment]
---

# Enhanced Tabby Deployment on Modal: Utilizing Persistent Volumes and Model Caching

In this post, we delve into recent enhancements to Tabby's deployment on Modal, focusing on model caching and the use of persistent volumes. These upgrades significantly improve both scalability and usability within serverless environments.

## Understanding Model Caching

Model caching is a key upgrade in our deployment strategy, offering substantial benefits:

1. **Scalability and Speed:** By storing large model files in the image layer, we eliminate the need to re-download them on each container startup. This sharply reduces cold-start time, keeping the service responsive and cost-effective, which is ideal for Function as a Service (FaaS) scenarios. Further details on image caching are available in Modal's [Image caching and rebuilds guide](https://modal.com/docs/guide/custom-container#image-caching-and-rebuilds).

2. **Efficiency:** Model caching cuts down the time and resources used to fetch and load models, which is crucial in environments requiring rapid scaling.

### Implementing Model Caching

Here’s how we utilize Modal’s image caching to expedite deployment and service scaling:

```python
def download_model():
    import subprocess

    subprocess.run(
        [
            TABBY_BIN,
            "download",
            "--model",
            MODEL_ID,
        ],
        env=TABBY_ENV,
    )


def download_chat_model():
    import subprocess

    subprocess.run(
        [
            TABBY_BIN,
            "download",
            "--model",
            CHAT_MODEL_ID,
        ],
        env=TABBY_ENV,
    )


def download_embedding_model():
    import subprocess

    subprocess.run(
        [
            TABBY_BIN,
            "download",
            "--model",
            EMBEDDING_MODEL_ID,
        ],
        env=TABBY_ENV,
    )


image = (
    Image.from_registry(
        IMAGE_NAME,
        add_python="3.11",
    )
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .run_function(download_chat_model)
    .run_function(download_embedding_model)
    .pip_install("asgi-proxy-lib")
)

app = App("tabby-server", image=image)
```

Modal rebuilds an image only when its definition changes; if the definition is unchanged, Modal pulls the previous version from cache, keeping deployments fast.
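
If you need to bust that cache for a single step, for instance after a new `tabbyml/tabby` tag is pushed, the following is a minimal sketch. It assumes the `force_build` flag that Modal's image-building methods accept at the time of writing; the `MODAL_FORCE_BUILD=1` environment variable shown in `app.py`'s docstring is the blanket alternative that forces every step to rebuild.

```python
# Force a rebuild of only the base image pull; cached layers for the
# remaining steps are reused where possible. force_build is an assumption
# about Modal's image API, not something shown in the original post.
image = (
    Image.from_registry(IMAGE_NAME, add_python="3.11", force_build=True)
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .run_function(download_chat_model)
    .run_function(download_embedding_model)
    .pip_install("asgi-proxy-lib")
)
```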

## The Role of Persistent Volumes

Persistent volumes tackle several challenges inherent in FaaS environments:

1. **Data Persistence:** Frequent container startups and shutdowns would otherwise wipe out user data and configuration. Persistent volumes keep this data intact across sessions. For more insights, check Modal’s [guide on persisting volumes](https://modal.com/docs/guide/volumes#persisting-volumes).

2. **User Experience:** Persisting configuration files and other essential data through volumes spares users from reconfiguring settings after every restart, improving both reliability and the overall experience.

3. **Operational Stability:** Persistent volumes provide a stable storage layer that keeps the service reliable amid frequent container cycling.

### Implementing Persistent Volumes

Here's our approach to utilizing Modal's persistent volumes to ensure data consistency and independence from the container lifecycle:

```python
data_volume = Volume.from_name("tabby-data", create_if_missing=True)
data_dir = "/data"


@app.function(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,
    container_idle_timeout=120,
    timeout=360,
    volumes={data_dir: data_volume},
    _allow_background_volume_commits=True,
    concurrency_limit=1,
)
```

The `create_if_missing=True` parameter lazily creates a volume if it doesn’t exist, while `_allow_background_volume_commits` allows for automatic data snapshotting and committing at regular intervals and upon container shutdown.
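
When you want to persist or pick up changes explicitly rather than waiting on background commits, Modal volumes expose `commit()` and `reload()`. Below is a minimal sketch; the `save_config` function and the `config.toml` path are illustrative, not part of the deployment above.

```python
@app.function(volumes={data_dir: data_volume})
def save_config():
    # Illustrative write; Tabby manages its own files under the mounted directory.
    with open(f"{data_dir}/config.toml", "w") as f:
        f.write("# tabby settings\n")
    data_volume.commit()  # make the write visible to other containers
    data_volume.reload()  # fetch changes committed elsewhere
```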

## The Complete App.py

The `app.py` script centralizes all configuration, model management, and serving functionality, making it the core component of our Modal deployment. You can find the complete file in the [Tabby GitHub repository](https://github.com/TabbyML/tabby/blob/main/website/blog/2024-07-10-enhanced-tabby-deployment-on-modal/app.py).

## Conclusion

These strategic enhancements on Modal not only optimize operational aspects but also significantly boost user experience by providing quicker startup times and reliable data persistence. By integrating model caching and persistent volumes, Tabby remains a sturdy and efficient tool in the ever-evolving serverless landscape.

For a deeper dive into these strategies, see our [detailed tutorial](https://github.com/TabbyML/tabby/blob/main/website/docs/quick-start/installation/modal/index.md), which walks through setting up your Tabby instance with these advanced features.

We hope this update inspires you to enhance your deployments similarly. Stay tuned for more updates, and happy coding with Tabby!
4 changes: 2 additions & 2 deletions website/docs/quick-start/installation/modal/app-running.png
Binary file not shown.
51 changes: 44 additions & 7 deletions website/docs/quick-start/installation/modal/app.py
```diff
@@ -1,65 +1,102 @@
+"""Usage:
+modal serve app.py
+
+To force a rebuild by pulling the latest image tag, use:
+MODAL_FORCE_BUILD=1 modal serve app.py
+"""
+
-from modal import Image, Stub, asgi_app, gpu
+from modal import Image, App, asgi_app, gpu
 
 IMAGE_NAME = "tabbyml/tabby"
 MODEL_ID = "TabbyML/StarCoder-1B"
+CHAT_MODEL_ID = "TabbyML/Qwen2-1.5B-Instruct"
+EMBEDDING_MODEL_ID = "TabbyML/Nomic-Embed-Text"
 GPU_CONFIG = gpu.T4()
 
+TABBY_BIN = "/opt/tabby/bin/tabby"
+
 
 def download_model():
     import subprocess
 
     subprocess.run(
         [
-            "/opt/tabby/bin/tabby-cpu",
+            TABBY_BIN,
             "download",
             "--model",
             MODEL_ID,
         ]
     )
 
 
+def download_chat_model():
+    import subprocess
+
+    subprocess.run(
+        [
+            TABBY_BIN,
+            "download",
+            "--model",
+            CHAT_MODEL_ID,
+        ]
+    )
+
+
+def download_embedding_model():
+    import subprocess
+
+    subprocess.run(
+        [
+            TABBY_BIN,
+            "download",
+            "--model",
+            EMBEDDING_MODEL_ID,
+        ]
+    )
+
+
 image = (
     Image.from_registry(
         IMAGE_NAME,
         add_python="3.11",
     )
     .dockerfile_commands("ENTRYPOINT []")
     .run_function(download_model)
+    .run_function(download_chat_model)
+    .run_function(download_embedding_model)
     .pip_install("asgi-proxy-lib")
 )
 
-stub = Stub("tabby-server-" + MODEL_ID.split("/")[-1], image=image)
+app = App("tabby-server", image=image)
 
 
-@stub.function(
+@app.function(
     gpu=GPU_CONFIG,
     allow_concurrent_inputs=10,
     container_idle_timeout=120,
     timeout=360,
 )
 @asgi_app()
-def app():
+def app_serve():
     import socket
     import subprocess
     import time
     from asgi_proxy import asgi_proxy
 
     launcher = subprocess.Popen(
         [
-            "/opt/tabby/bin/tabby",
+            TABBY_BIN,
             "serve",
             "--model",
             MODEL_ID,
+            "--chat-model",
+            CHAT_MODEL_ID,
             "--port",
             "8000",
             "--device",
             "cuda",
             "--parallelism",
-            "4",
+            "1",
         ]
     )
```
(Remainder of the diff not expanded.)