Commit
workin?
Sven Cattell committed Jan 8, 2024
1 parent ffcf431 commit ca03a2b
Showing 7 changed files with 135 additions and 8 deletions.
24 changes: 24 additions & 0 deletions kube/create_cluster.sh
@@ -0,0 +1,24 @@
#!/bin/bash

# Select the GCP project and region used for the demo cluster.
gcloud config set project hoth-410100
export PROJECT_ID=$(gcloud config get project)
export REGION=asia-east1

# Create a regional GKE cluster with Workload Identity, image streaming,
# and the GCS FUSE CSI driver enabled; --enable-autoscaling is required
# when passing --min-nodes/--max-nodes.
gcloud container clusters create hoth-demo --location ${REGION} \
    --workload-pool=${PROJECT_ID}.svc.id.goog \
    --enable-image-streaming \
    --node-locations=${REGION}-a \
    --addons GcsFuseCsiDriver \
    --machine-type n2d-standard-4 \
    --enable-autoscaling \
    --num-nodes 1 --min-nodes 1 --max-nodes 5 \
    --ephemeral-storage-local-ssd=count=2

# Spot node pool with one P100 per node; it scales from zero, so GPU
# nodes only exist while a model deployment is pending or running.
gcloud container node-pools create p100-test --cluster hoth-demo \
    --accelerator type=nvidia-tesla-p100,count=1,gpu-driver-version=latest \
    --machine-type n1-standard-8 \
    --ephemeral-storage-local-ssd=count=1 \
    --enable-autoscaling --enable-image-streaming \
    --num-nodes=0 --min-nodes=0 --max-nodes=20 \
    --node-locations ${REGION}-a,${REGION}-c --region ${REGION} --spot
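
For reference, a minimal sketch of using the cluster once the script finishes, assuming the manifests below are checked out under kube/ (the model chosen is illustrative):

# Fetch kubectl credentials for the new cluster.
gcloud container clusters get-credentials hoth-demo --region asia-east1

# Deploy a model and its service from this commit, then watch the GPU
# pool scale up from zero as the pod schedules.
kubectl apply -f kube/mistral.yaml -f kube/mistral-service.yaml
kubectl get pods -w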
4 changes: 2 additions & 2 deletions kube/llm-service.yaml → kube/falcon-service.yaml
@@ -1,10 +1,10 @@
apiVersion: v1
kind: Service
metadata:
-  name: llm-service
+  name: falcon-service
spec:
  selector:
-    app: llm
+    app: falcon-7b
  type: ClusterIP
  ports:
    - protocol: TCP
44 changes: 44 additions & 0 deletions kube/falcon.yaml
@@ -0,0 +1,44 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: falcon-7b
spec:
  replicas: 1
  selector:
    matchLabels:
      app: falcon-7b
  template:
    metadata:
      labels:
        app: falcon-7b
    spec:
      containers:
        - name: llm
          image: ghcr.io/huggingface/text-generation-inference:1.3.4
          resources:
            limits:
              nvidia.com/gpu: "1"
          env:
            - name: MODEL_ID
              value: OpenAssistant/falcon-7b-sft-top1-696
            - name: NUM_SHARD
              value: "1"
            - name: PORT
              value: "8080"
            - name: QUANTIZE
              value: bitsandbytes-nf4
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /data
              name: data
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: data
          emptyDir: {}
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-p100
        cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
        cloud.google.com/gke-spot: "true"
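
The container runs Hugging Face's text-generation-inference server on the PORT set above. A minimal smoke test against TGI's /generate endpoint, assuming falcon-service (previous file) is applied; the prompt and token count are illustrative:

# Forward local 8080 to the service's port 80, which targets pod port 8080.
kubectl port-forward svc/falcon-service 8080:80 &

curl http://localhost:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Kubernetes?", "parameters": {"max_new_tokens": 64}}'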
12 changes: 6 additions & 6 deletions kube/llama2.yaml
@@ -1,23 +1,23 @@
apiVersion: apps/v1
kind: Deployment
metadata:
-  name: llm
+  name: llama2-7b
spec:
  replicas: 1
  selector:
    matchLabels:
-      app: llm
+      app: llama2-7b
  template:
    metadata:
      labels:
-        app: llm
+        app: llama2-7b
    spec:
      containers:
        - name: llm
          image: ghcr.io/huggingface/text-generation-inference:1.1.0
          resources:
            limits:
-              nvidia.com/gpu: "2"
+              nvidia.com/gpu: "1"
          env:
            - name: MODEL_ID
              value: meta-llama/Llama-2-7b-chat-hf
@@ -26,7 +26,7 @@ spec:
            - name: PORT
              value: "8080"
            - name: QUANTIZE
-              value: bitsandbytes-nf8
+              value: bitsandbytes-nf4
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
@@ -39,6 +39,6 @@ spec:
        - name: data
          emptyDir: {}
      nodeSelector:
-        cloud.google.com/gke-accelerator: nvidia-l4
+        cloud.google.com/gke-accelerator: nvidia-tesla-p100
        cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
        cloud.google.com/gke-spot: "true"
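
Two fixes travel together in this diff: bitsandbytes-nf8 does not appear among TGI's documented --quantize values (bitsandbytes-nf4 does), and the accelerator moves from L4 to P100. As a rough sanity check, 7B parameters at 4 bits is about 7e9 × 0.5 bytes ≈ 3.5 GB of weights, leaving room for activations and KV cache within the P100's 16 GB, so dropping from two GPUs to one is consistent.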
12 changes: 12 additions & 0 deletions kube/mistral-service.yaml
@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
  name: mistral-service
spec:
  selector:
    app: mistral-7b
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8080
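
ClusterIP services are reachable only from inside the cluster; other pods address this one by DNS name (http://mistral-service, port 80 forwarding to pod port 8080). A quick way to confirm the service has resolved to a pod:

kubectl get svc mistral-service
kubectl get endpoints mistral-service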
44 changes: 44 additions & 0 deletions kube/mistral.yaml
@@ -0,0 +1,44 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-7b
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-7b
  template:
    metadata:
      labels:
        app: mistral-7b
    spec:
      containers:
        - name: llm
          image: ghcr.io/huggingface/text-generation-inference:1.3.4
          resources:
            limits:
              nvidia.com/gpu: "1"
          env:
            - name: MODEL_ID
              value: mistralai/Mistral-7B-Instruct-v0.2
            - name: NUM_SHARD
              value: "1"
            - name: PORT
              value: "8080"
            - name: QUANTIZE
              value: bitsandbytes-nf4
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /data
              name: data
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: data
          emptyDir: {}
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-p100
        cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
        cloud.google.com/gke-spot: "true"
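
All three deployments use the same nodeSelector trio to land on the spot P100 pool from create_cluster.sh. A sketch for verifying scheduling, assuming the mistral deployment above is applied:

# GKE sets these labels on GPU nodes automatically.
kubectl get nodes -l cloud.google.com/gke-accelerator=nvidia-tesla-p100

# Confirm where the pod landed and that a GPU was allocated.
kubectl describe pod -l app=mistral-7b | grep -iE 'node|gpu'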
3 changes: 3 additions & 0 deletions makefile
@@ -10,5 +10,8 @@ test:
publish_dev:
	docker buildx build --platform linux/amd64,linux/arm64,linux/arm/v7 --file dockerfiles/Dockerfile -t aivillage/llm_router:dev --push .

+publish:
+	docker buildx build --platform linux/amd64,linux/arm64,linux/arm/v7 --file dockerfiles/Dockerfile -t aivillage/llm_router:latest --push .
+
lockfile:
	cargo generate-lockfile
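
The multi-platform --push targets assume a buildx builder with those platforms enabled; a one-time setup sketch (the builder name is arbitrary):

docker buildx create --name multiarch --use
docker buildx inspect --bootstrap
make publish_dev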
