diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9f2be5cf6..e2e8d797f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,6 +7,7 @@ repos:
  - id: trailing-whitespace
  - id: end-of-file-fixer
  - id: check-yaml
+   args: ['--allow-multiple-documents']
  - id: check-added-large-files
- repo: https://github.com/psf/black
  rev: 22.10.0
diff --git a/docs/shortfin/llm/user/llama_end_to_end.md b/docs/shortfin/llm/user/llama_serving.md
similarity index 67%
rename from docs/shortfin/llm/user/llama_end_to_end.md
rename to docs/shortfin/llm/user/llama_serving.md
index a74851407..cc2c959b4 100644
--- a/docs/shortfin/llm/user/llama_end_to_end.md
+++ b/docs/shortfin/llm/user/llama_serving.md
@@ -272,3 +272,32 @@ If you want to find the process again:

```bash
ps -f | grep shortfin
```
+
+## Server Options
+
+To run the server with different options, you can use the
+following command to see the available flags:
+
+```bash
+python -m shortfin_apps.llm.server --help
+```
+
+A full list of options can be found below:
+
+| Argument | Description |
+| -------- | ----------- |
+| `--host HOST` | Specify the host to bind the server. |
+| `--port PORT` | Specify the port to bind the server. |
+| `--root-path ROOT_PATH` | Root path to use for installing behind a path-based proxy. |
+| `--timeout-keep-alive TIMEOUT_KEEP_ALIVE` | Keep-alive timeout duration. |
+| `--tokenizer_json TOKENIZER_JSON` | Path to a `tokenizer.json` file. |
+| `--tokenizer_config_json TOKENIZER_CONFIG_JSON` | Path to a `tokenizer_config.json` file. |
+| `--model_config MODEL_CONFIG` | Path to the model config file. |
+| `--vmfb VMFB` | Model [VMFB](https://iree.dev/developers/general/developer-tips/#inspecting-vmfb-files) to load. |
+| `--parameters [FILE ...]` | Parameter archives to load (supports: `gguf`, `irpa`, `safetensors`). |
+| `--device {local-task,hip,amdgpu}` | Device to serve on (e.g., `local-task`, `hip`). Same options as [iree-run-module --list_drivers](https://iree.dev/guides/deployment-configurations/gpu-rocm/#get-the-iree-runtime). |
+| `--device_ids [DEVICE_IDS ...]` | Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a device ID like `amdgpu:0:0@0`. |
+| `--isolation {none,per_fiber,per_call}` | Concurrency control: How to isolate programs. |
+| `--amdgpu_async_allocations` | Enable asynchronous allocations for AMD GPU device contexts. |
diff --git a/docs/shortfin/llm/user/llama_serving_on_kubernetes.md b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md
new file mode 100644
index 000000000..f573bd8ae
--- /dev/null
+++ b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md
@@ -0,0 +1,44 @@
+# Llama 8b GPU instructions on Kubernetes
+
+## Setup
+
+We will use a `llama_8b_f16` example to describe the process of exporting a model
+and deploying four instances of a shortfin LLM server behind a load balancer on
+MI300X GPUs.
+
+### Prerequisites
+
+- A Kubernetes cluster available to use
+- kubectl installed on your system and configured for the cluster of interest
+  - To install kubectl, please check out [kubectl install](https://kubernetes.io/docs/tasks/tools/#kubectl),
+    and make sure to set the `KUBECONFIG` environment variable to point to your kubeconfig
+    file to authorize the connection to the cluster.
+
+### Deploy shortfin llama app service
+
+To generate the artifacts required for this k8s deployment, please follow
+[llama_serving.md](./llama_serving.md) until you have all of the files needed to run the
+shortfin LLM server.
+Upload your artifacts to a storage option that you can pull from in your k8s cluster (NFS, S3, CSP).
+Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml)
+locally and edit it to point at the artifacts you just stored, changing the flags to match your
+intended configuration.
+
+To deploy the llama app:
+
+```bash
+kubectl apply -f llama-app-deployment.yaml
+```
+
+To retrieve the external IP for targeting the llama app load balancer:
+
+```bash
+kubectl get service shark-llama-app-service
+```
+
+Now you can use the external IP for sglang integration or for sending text generation requests directly.
+
+### Delete shortfin llama app service
+
+When you are done using the service, make sure to delete the deployment and service:
+
+```bash
+kubectl delete deployment shark-llama-app-deployment
+kubectl delete service shark-llama-app-service
+```
diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
index 1292eba66..3812b5277 100644
--- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
+++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
@@ -4,15 +4,15 @@ This doc includes basic steps for hooking up sglang with a running Shortfin serv

## Current Support Status

-| Feature | Description | Enabled | Reference |
-| ----------- | ----------- | ---------- | ------------ |
-| `gen` | Generate shortfin completion, given a prompt | ✅ | [Shortfin Implementation](https://github.com/nod-ai/sglang/blob/main/python/sglang/lang/backend/shortfin.py) |
-| `streaming` | Stream shortfin completion, given a prompt | ✅ | [Streaming](https://sgl-project.github.io/frontend/frontend.html#streaming) |
-| `run_batch` | Run batch of disjoint requests with continous batching | ✅ | [Batching](https://sgl-project.github.io/frontend/frontend.html#batching) |
-| `fork` | Generate sections of the same prompt in parallel | ✅ | [Fork Docs](https://sgl-project.github.io/frontend/frontend.html#parallelism) |
-| `choices` | Given set of choices, generate response based on best log probs | ❌ | [Choices Methods](https://sgl-project.github.io/frontend/choices_methods.html#choices-methods-in-sglang) |
-| `image` | Pass image as part of multi-modal prompt | ❌ | [sgl.image](https://sgl-project.github.io/frontend/frontend.html#multi-modality) |
-| `regex` | Specify regular expression as decoding constraint | ❌ | [Regex](https://sgl-project.github.io/frontend/frontend.html#constrained-decoding) |
+| Feature | Description | Enabled | Reference |
+| ----------- | ----------- | ------- | --------- |
+| `gen` | Generate shortfin completion, given a prompt | ✅ | [Shortfin Implementation](https://github.com/nod-ai/sglang/blob/main/python/sglang/lang/backend/shortfin.py) |
+| `streaming` | Stream shortfin completion, given a prompt | ✅ | [Streaming](https://sgl-project.github.io/frontend/frontend.html#streaming) |
+| `run_batch` | Run batch of disjoint requests with continuous batching | ✅ | [Batching](https://sgl-project.github.io/frontend/frontend.html#batching) |
+| `fork` | Generate sections of the same prompt in parallel | ✅ | [Fork Docs](https://sgl-project.github.io/frontend/frontend.html#parallelism) |
+| `choices` | Given set of choices, generate response based on best log probs | ❌ | [Choices Methods](https://sgl-project.github.io/frontend/choices_methods.html#choices-methods-in-sglang) |
+| `image` | Pass image as part of multi-modal prompt | ❌ | [sgl.image](https://sgl-project.github.io/frontend/frontend.html#multi-modality) |
+| `regex` | Specify regular expression as decoding constraint | ❌ | [Regex](https://sgl-project.github.io/frontend/frontend.html#constrained-decoding) |

## Prerequisites

@@ -24,20 +24,22 @@ For this tutorial, you will need to meet the following prerequisites:
- You can check out [pyenv](https://github.com/pyenv/pyenv)
  as a good tool to be able to manage multiple versions of python
  on the same system.
-- A running `shortfin` LLM server as described [below](#installstart-shortfin-llm-server)
+
+### Shortfin LLM Server
+
+- A running `shortfin` LLM server. Directions for launching the LLM server on a single system
+  can be found in [Llama end to end serving instructions](./llama_serving.md); for launching on
+  a Kubernetes cluster, see [Llama 8b GPU instructions on Kubernetes](./llama_serving_on_kubernetes.md).
- We will use the shortfin server as the `backend` to generate completions
  from SGLang's `frontend language`. In this tutorial, you can think of
  `sglang` as the client and `shortfin` as the server.

-### Hardware
-
-- This tutorial is designed to run on an [AMD MI300X GPU](https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html)
-
-## Install/Start `shortfin` LLM server
+After the `shortfin` LLM server has started, we need to obtain its base URL.
+We will store this in our environment in order to send requests to `shortfin`
+through the `sglang` client examples below.

-Follow the steps [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/llama_end_to_end.md)
-to export a model with `sharktank` and start a `shortfin` LLM server
-with that model.
+```bash
+export SHORTFIN_BASE_URL="SHORTFIN_BASE_URL" # example: http://localhost:8000
+```

## Install sglang

@@ -48,6 +50,8 @@ We can use pip to install it in the same virtual environment that we used
to start our Shortfin LLM Server.

```bash
+python -m venv --prompt shark-ai .venv
+source .venv/bin/activate
pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
```

@@ -56,8 +60,23 @@ pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"

## Verify sglang installation

You can verify the installation/setup through the following examples:

- [Multi-Turn Q&A Example](#multi-turn-qa-example)
+- [Streaming Example](#streaming-example)
- [Fork Example](#fork-example)
-- [Benchmark Shortfin](#bench-mark-shortfin-w-sglang-bench_serving-script)
+- [Multi-Turn Q&A Batching Example](#multi-turn-qa-batch-example)
+
+In these examples, we will set `max_tokens` to 50 when generating completions.
+This controls how many tokens we want to generate for each completion.
+
+We can modify the arguments passed to `sgl.gen` to alter the outputs of our
+`shortfin` LLM server. Specifically:
+
+- `max_tokens` - The maximum number of tokens to generate for a completion.
+  We may obtain longer responses by increasing this value,
+  and shorter responses by decreasing it.
+- `temperature` - We can include a temperature parameter to control the
+  randomness of the generated completions. A higher value
+  will result in more randomness, while a lower value will
+  result in more deterministic completions.
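+
+Both arguments can be combined in a single `sgl.gen` call. The following is a minimal
+sketch only: the `short_answer` function name and the `temperature=0.8` value are
+illustrative choices, and it assumes a backend has already been registered with
+`sgl.set_default_backend`, as shown in the examples below.
+
+```python
+import sglang as sgl
+
+@sgl.function
+def short_answer(s, question):
+    # Cap the completion at 50 tokens and raise the temperature for more
+    # varied wording; lower the temperature for more deterministic output.
+    s += sgl.user(question)
+    s += sgl.assistant(sgl.gen("answer", max_tokens=50, temperature=0.8))
+```
+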
## Multi-Turn Q&A example @@ -75,20 +94,24 @@ python You can copy and paste the following example into your interpreter: ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://localhost:8000", ) # Change base_url if running at different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) sgl.set_default_backend(backend) @sgl.function def multi_turn_question(s, question_1, question_2): s += sgl.user(question_1) - s += sgl.assistant(sgl.gen("answer_1", max_tokens=256)) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) s += sgl.user(question_2) - s += sgl.assistant(sgl.gen("answer_2", max_tokens=256)) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) state = multi_turn_question.run(question_1="Name the capital city of the USA.", question_2="The Smithsonian is in this location.") @@ -96,40 +119,64 @@ for m in state.messages(): print(m["role"], m["content"]) ``` -### Shortfin example output +## Streaming Example -You should see an output similar to this: +We can stream our request for a more responsive feel. Let's invoke a `streaming` Q&A from our server: -```text -========== single ========== +```python +import os -user : Name the capital city of the USA -assistant : The capital city of the United States of America is Washington, D.C. (short for District of Columbia). -user : The Smithsonian is in this location. -assistant : The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. -``` +import sglang as sgl +from sglang.lang.chat_template import get_chat_template -## Fork example +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] -Now that we have sglang installed, we can run an example to show a `fork` -flow with the SGLang [Frontend Language](https://sgl-project.github.io/frontend/frontend.html): +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) -### Open python interpreter +sgl.set_default_backend(backend) -```bash -python +@sgl.function +def multi_turn_question(s, question_1, question_2): + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) + +question_1 = "Name the capital city of the USA." +question_2 = "The Smithsonian is in this location." 
+ +# Run the multi-turn question function with streaming enabled +state = multi_turn_question.run( + question_1=question_1, + question_2=question_2, + stream=True, +) + +# Collect messages from the streamed output +messages = "" + +for chunk in state.text_iter(): + messages += chunk + +print(messages) ``` -### Run example -You can copy and paste the following example into your interpreter: +## Fork example + +We can also send different pieces of the same prompt in parallel using the `fork` +flow with the SGLang [Frontend Language](https://sgl-project.github.io/frontend/frontend.html): ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://localhost:8000") # Change base_url if running at different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) sgl.set_default_backend(backend) @@ -142,7 +189,7 @@ def tip_suggestion(s): forks = s.fork(2) for i, f in enumerate(forks): f += f"Now, expand tip {i+1} into a paragraph:\n" - f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n") + f += sgl.gen(f"detailed_tip", max_tokens=50, stop="\n\n") s += "Tip 1:" + forks[0]["detailed_tip"] + "\n" s += "Tip 2:" + forks[1]["detailed_tip"] + "\n" s += "In summary" + sgl.gen("summary") @@ -152,103 +199,64 @@ state = tip_suggestion.run() print(state.text()) ``` -### Shortfin example output - -You should see an output similar to this: - -```text -Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise. - -Tip 1:A balanced diet is important for maintaining good health. It should -include a variety of foods from all the major food groups, such as fruits, -vegetables, grains, proteins, and dairy. Eating a balanced diet can help -prevent chronic diseases such as heart disease, diabetes, and obesity. - -Now, expand tip 2 into a paragraph: -Regular exercise is also important for maintaining good health. It can help -improve cardiovascular health, strengthen muscles and bones, and reduce the -risk of chronic diseases. Exercise can also help improve mental health by -reducing stress and anxiety. It is recommended that adults get at least 150 -minutes of moderate-intensity exercise or 75 minutes of vigorous-intensity -exercise per week. - -Now, combine the two paragraphs into a single paragraph: -A balanced diet and regular exercise are both important for maintaining good -health. A balanced diet should include a variety of foods from all the major -food groups, such as fruits, vegetables, grains, proteins, and dairy. -Eating a balanced diet can help prevent chronic diseases such as heart disease, -diabetes, and obesity. Regular exercise is also important for maintaining good -health. It can help improve cardiovascular health, strengthen muscles and bones, -and reduce the risk of chronic diseases. Exercise can also help improve mental -health by reducing stress and anxiety. It is recommended that - -Tip 2:Regular exercise is important for maintaining a healthy body and mind. -It can help improve cardiovascular health, strengthen muscles and bones, -and reduce the risk of chronic diseases such as diabetes and heart disease. -Additionally, exercise has been shown to improve mood, reduce stress, -and increase overall well-being. 
It is recommended that adults engage in -at least 150 minutes of moderate-intensity aerobic activity or 75 minutes of -vigorous-intensity aerobic activity per week, as well as strength training -exercises at least two days per week. - -In summary, a balanced diet and regular exercise are both essential for -maintaining good health. A balanced diet should include a variety of foods from -all the major food groups, while regular exercise can help improve -cardiovascular health, strengthen muscles and bones, reduce the risk of -chronic diseases, and improve mental health. It is recommended that adults -engage in at least 150 minutes of moderate-intensity aerobic activity or -75 minutes of vigorous-intensity aerobic activity per week, -as well as strength training exercises at least two days per week. -``` +## Multi-Turn Q&A Batch Example -## Benchmark shortfin w/ sglang `bench_serving` script +With **Shortfin** + SGLang, we can also easily send requests as a batch. +Let's now invoke a `batched` Q&A flow with the SGLang [Batching](https://sgl-project.github.io/frontend/frontend.html#batching): -We can obtain benchmarking metrics using the `bench_serving` script -provided by SGLang: +```python +import os -**NOTE: Change `--base-url` if running at a different address** +import sglang as sgl +from sglang.lang.chat_template import get_chat_template -```bash -python -m sglang.bench_serving --backend shortfin --num-prompt 10 --base-url http://localhost:8000 --tokenizer /path/to/tokenizer/dir --request-rate 1 -``` +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) + +# Set the default backend for sglang +sgl.set_default_backend(backend) + +@sgl.function +def multi_turn_question(s, question_1, question_2): + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) + +# Define the questions for the first and second sets +question_1_1 = "Name the capital city of the USA." +question_1_2 = "The Smithsonian is in this location." +question_2_1 = "Name the largest city in the USA." +question_2_2 = "The Empire State Building is in this location." 
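+
+# Note: each dict passed to run_batch below supplies the keyword arguments
+# for one call of multi_turn_question, so the two question sets are sent to
+# the backend together as a single batch.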
+ +# Run the multi-turn question function in batch mode +states = multi_turn_question.run_batch( + [ + { + "question_1": question_1_1, + "question_2": question_1_2, + }, + { + "question_1": question_2_1, + "question_2": question_2_2, + }, + ] +) + +# Extract responses from the states +first_qa = states[0] +second_qa = states[1] + +first_qa_messages = first_qa.messages() +second_qa_messages = second_qa.messages() + +# Print messages from the first QA session +for m in first_qa_messages: + print(m["role"], m["content"]) + +# Print messages from the second QA session +for m in second_qa_messages: + print(m["role"], m["content"]) -There are some more metrics captured, but the most relevant are the following: - -- E2E Latency -- TTFT (Time to First Token) -- TPOT (Time per Output Token) -- ITL (Inter-Token Latency) -- Request Throughput -- Benchmark Duration - -When complete, you should see an output similar to this: - -```text -============ Serving Benchmark Result ============ -Backend: shortfin -Traffic request rate: 1.0 -Successful requests: 10 -Benchmark duration (s): 427.91 -Total input tokens: 1960 -Total generated tokens: 2774 -Total generated tokens (retokenized): 63 -Request throughput (req/s): 0.02 -Input token throughput (tok/s): 4.58 -Output token throughput (tok/s): 6.48 -----------------End-to-End Latency---------------- -Mean E2E Latency (ms): 416268.77 -Median E2E Latency (ms): 417159.14 ----------------Time to First Token---------------- -Mean TTFT (ms): 292404.29 -Median TTFT (ms): 365989.01 -P99 TTFT (ms): 367325.63 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 1359.41 -Median TPOT (ms): 163.96 -P99 TPOT (ms): 6316.12 ----------------Inter-token Latency---------------- -Mean ITL (ms): 2238.99 -Median ITL (ms): 958.75 -P99 ITL (ms): 2719.50 -================================================== ``` diff --git a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml new file mode 100644 index 000000000..08a22aa3d --- /dev/null +++ b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -0,0 +1,59 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: shark-llama-app-deployment +spec: + replicas: 4 # number of server instances + selector: + matchLabels: + app: shark-llama-app + template: + metadata: + labels: + app: shark-llama-app + spec: + containers: + - name: shark-llama-app-container + image: rocm/dev-ubuntu-22.04:6.3 + command: ["/bin/bash", "-c"] + # update to artifacts you generated form llama_serving.md (this is an example with the base llama3.1 8b tp1 artifacts) + # change cli flags for instantiation of server to match your intended llama configuration + args: + - | + sudo apt update && + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash && + sudo apt install git -y && + sudo apt install python3.11 python3.11-dev python3.11-venv -y && + sudo apt-get install wget -y && + python3.11 -m venv shark_venv && source shark_venv/bin/activate && + mkdir shark_artifacts && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/config.json -O shark_artifacts/config.json && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/meta-llama-3.1-8b-instruct.f16.gguf -O shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/model.vmfb -O shark_artifacts/model.vmfb && + wget 
https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/tokenizer_config.json -O shark_artifacts/tokenizer_config.json && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/tokenizer.json -O shark_artifacts/tokenizer.json && + pip install --pre shortfin[apps] -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels && + pip install pandas && + python -m shortfin_apps.llm.server --tokenizer_json=shark_artifacts/tokenizer.json --model_config=shark_artifacts/config.json --vmfb=shark_artifacts/model.vmfb --parameters=shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf --device=hip; + resources: + # change number of gpus required here based on your llama configuration + requests: + amd.com/gpu: 1 + limits: + amd.com/gpu: 1 + restartPolicy: Always + +--- + +apiVersion: v1 +kind: Service +metadata: + name: shark-llama-app-service +spec: + selector: + app: shark-llama-app + ports: + - protocol: TCP + port: 80 # external port + targetPort: 8000 # port the container exposes + type: LoadBalancer