Commit
workin?
Sven Cattell committed Jan 8, 2024
1 parent ffcf431 commit ca03a2b
Showing 7 changed files with 135 additions and 8 deletions.
24 changes: 24 additions & 0 deletions kube/create_cluster.sh
@@ -0,0 +1,24 @@
#!/bin/bash

# Select the GCP project and region used for the demo cluster.
gcloud config set project hoth-410100
export PROJECT_ID=$(gcloud config get project)
export REGION=asia-east1

# Create a regional GKE cluster with Workload Identity, image streaming,
# and the GCS FUSE CSI driver enabled; --enable-autoscaling is required
# when passing --min-nodes/--max-nodes.
gcloud container clusters create hoth-demo --location ${REGION} \
    --workload-pool=${PROJECT_ID}.svc.id.goog \
    --enable-image-streaming \
    --node-locations=${REGION}-a \
    --addons GcsFuseCsiDriver \
    --machine-type n2d-standard-4 \
    --enable-autoscaling \
    --num-nodes 1 --min-nodes 1 --max-nodes 5 \
    --ephemeral-storage-local-ssd=count=2

# Spot node pool with one P100 per node; it scales from zero, so GPU
# nodes only exist while a model deployment is pending or running.
gcloud container node-pools create p100-test --cluster hoth-demo \
    --accelerator type=nvidia-tesla-p100,count=1,gpu-driver-version=latest \
    --machine-type n1-standard-8 \
    --ephemeral-storage-local-ssd=count=1 \
    --enable-autoscaling --enable-image-streaming \
    --num-nodes=0 --min-nodes=0 --max-nodes=20 \
    --node-locations ${REGION}-a,${REGION}-c --region ${REGION} --spot
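
For reference, a minimal sketch of using the cluster once the script finishes, assuming the manifests below are checked out under kube/ (the model chosen is illustrative):

# Fetch kubectl credentials for the new cluster.
gcloud container clusters get-credentials hoth-demo --region asia-east1

# Deploy a model and its service from this commit, then watch the GPU
# pool scale up from zero as the pod schedules.
kubectl apply -f kube/mistral.yaml -f kube/mistral-service.yaml
kubectl get pods -w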
4 changes: 2 additions & 2 deletions kube/llm-service.yaml → kube/falcon-service.yaml
@@ -1,10 +1,10 @@
apiVersion: v1
kind: Service
metadata:
-  name: llm-service
+  name: falcon-service
spec:
  selector:
-    app: llm
+    app: falcon-7b
  type: ClusterIP
  ports:
    - protocol: TCP
44 changes: 44 additions & 0 deletions kube/falcon.yaml
@@ -0,0 +1,44 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: falcon-7b
spec:
  replicas: 1
  selector:
    matchLabels:
      app: falcon-7b
  template:
    metadata:
      labels:
        app: falcon-7b
    spec:
      containers:
        - name: llm
          image: ghcr.io/huggingface/text-generation-inference:1.3.4
          resources:
            limits:
              nvidia.com/gpu: "1"
          env:
            - name: MODEL_ID
              value: OpenAssistant/falcon-7b-sft-top1-696
            - name: NUM_SHARD
              value: "1"
            - name: PORT
              value: "8080"
            - name: QUANTIZE
              value: bitsandbytes-nf4
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /data
              name: data
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: data
          emptyDir: {}
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-p100
        cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
        cloud.google.com/gke-spot: "true"
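
The container runs Hugging Face's text-generation-inference server on the PORT set above. A minimal smoke test against TGI's /generate endpoint, assuming falcon-service (previous file) is applied; the prompt and token count are illustrative:

# Forward local 8080 to the service's port 80, which targets pod port 8080.
kubectl port-forward svc/falcon-service 8080:80 &

curl http://localhost:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Kubernetes?", "parameters": {"max_new_tokens": 64}}'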
12 changes: 6 additions & 6 deletions kube/llama2.yaml
@@ -1,23 +1,23 @@
apiVersion: apps/v1
kind: Deployment
metadata:
-  name: llm
+  name: llama2-7b
spec:
  replicas: 1
  selector:
    matchLabels:
-      app: llm
+      app: llama2-7b
  template:
    metadata:
      labels:
-        app: llm
+        app: llama2-7b
    spec:
      containers:
        - name: llm
          image: ghcr.io/huggingface/text-generation-inference:1.1.0
          resources:
            limits:
-              nvidia.com/gpu: "2"
+              nvidia.com/gpu: "1"
          env:
            - name: MODEL_ID
              value: meta-llama/Llama-2-7b-chat-hf
@@ -26,7 +26,7 @@ spec:
            - name: PORT
              value: "8080"
            - name: QUANTIZE
-              value: bitsandbytes-nf8
+              value: bitsandbytes-nf4
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
@@ -39,6 +39,6 @@ spec:
        - name: data
          emptyDir: {}
      nodeSelector:
-        cloud.google.com/gke-accelerator: nvidia-l4
+        cloud.google.com/gke-accelerator: nvidia-tesla-p100
        cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
        cloud.google.com/gke-spot: "true"
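
Two fixes travel together in this diff: bitsandbytes-nf8 does not appear among TGI's documented --quantize values (bitsandbytes-nf4 does), and the accelerator moves from L4 to P100. As a rough sanity check, 7B parameters at 4 bits is about 7e9 × 0.5 bytes ≈ 3.5 GB of weights, leaving room for activations and KV cache within the P100's 16 GB, so dropping from two GPUs to one is consistent.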
12 changes: 12 additions & 0 deletions kube/mistral-service.yaml
@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
  name: mistral-service
spec:
  selector:
    app: mistral-7b
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8080
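
ClusterIP services are reachable only from inside the cluster; other pods address this one by DNS name (http://mistral-service, port 80 forwarding to pod port 8080). A quick way to confirm the service has resolved to a pod:

kubectl get svc mistral-service
kubectl get endpoints mistral-service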
44 changes: 44 additions & 0 deletions kube/mistral.yaml
@@ -0,0 +1,44 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-7b
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-7b
  template:
    metadata:
      labels:
        app: mistral-7b
    spec:
      containers:
        - name: llm
          image: ghcr.io/huggingface/text-generation-inference:1.3.4
          resources:
            limits:
              nvidia.com/gpu: "1"
          env:
            - name: MODEL_ID
              value: mistralai/Mistral-7B-Instruct-v0.2
            - name: NUM_SHARD
              value: "1"
            - name: PORT
              value: "8080"
            - name: QUANTIZE
              value: bitsandbytes-nf4
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /data
              name: data
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: data
          emptyDir: {}
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-p100
        cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
        cloud.google.com/gke-spot: "true"
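
All three deployments use the same nodeSelector trio to land on the spot P100 pool from create_cluster.sh. A sketch for verifying scheduling, assuming the mistral deployment above is applied:

# GKE sets these labels on GPU nodes automatically.
kubectl get nodes -l cloud.google.com/gke-accelerator=nvidia-tesla-p100

# Confirm where the pod landed and that a GPU was allocated.
kubectl describe pod -l app=mistral-7b | grep -iE 'node|gpu'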
3 changes: 3 additions & 0 deletions makefile
@@ -10,5 +10,8 @@ test:
publish_dev:
	docker buildx build --platform linux/amd64,linux/arm64,linux/arm/v7 --file dockerfiles/Dockerfile -t aivillage/llm_router:dev --push .

+publish:
+	docker buildx build --platform linux/amd64,linux/arm64,linux/arm/v7 --file dockerfiles/Dockerfile -t aivillage/llm_router:latest --push .
+
lockfile:
	cargo generate-lockfile
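
The multi-platform --push targets assume a buildx builder with those platforms enabled; a one-time setup sketch (the builder name is arbitrary):

docker buildx create --name multiarch --use
docker buildx inspect --bootstrap
make publish_dev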
