forked from getumbrel/llama-gpt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun-mac.sh
executable file
·253 lines (219 loc) · 8.22 KB
/
run-mac.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/bin/bash
set -e
# Define a function to refresh the source of .zshrc or .bashrc
source_shell_rc() {
# Source .zshrc or .bashrc
if [ -f ~/.zshrc ]; then
source ~/.zshrc
elif [ -f ~/.bashrc ]; then
source ~/.bashrc
else
echo "No .bashrc or .zshrc file found."
fi
}
# Define a function to install conda with Miniforge3
install_conda() {
# Download Miniforge3
curl -L -o /tmp/Miniforge3.sh https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
bash /tmp/Miniforge3.sh
source_shell_rc
}
# Define a function to install a specific version of llama-cpp-python
install_llama_cpp_python() {
local model_type=$1
local version=$2
local installed_version=$(pip3 show llama-cpp-python | grep -i version | awk '{print $2}')
if [[ "$installed_version" != "$version" ]]; then
echo "llama-cpp-python version is not $version. Installing $version version for $model_type support..."
pip3 uninstall llama-cpp-python -y
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip3 install llama-cpp-python==$version --no-cache-dir
pip3 install 'llama-cpp-python[server]'
else
echo "llama-cpp-python version is $version."
fi
}
source_shell_rc
# Check if the platform is MacOS and the architecture is arm64
if [[ "$(uname)" != "Darwin" ]] || [[ "$(uname -m)" != "arm64" ]]; then
echo "This script is intended to be run on MacOS with M1/M2 chips. Exiting..."
exit 1
fi
# Check if Docker is installed
if ! command -v docker &> /dev/null; then
echo "Docker is not installed. Exiting..."
exit 1
fi
# Check if python3 is installed
if ! command -v python3 &> /dev/null; then
echo "Python3 is not installed. Exiting..."
exit 1
fi
# Check if Xcode is installed
xcode_path=$(xcode-select -p 2>/dev/null)
if [ -z "$xcode_path" ]; then
echo "Xcode is not installed. Installing (this may take a long time)..."
xcode-select --install
else
echo "Xcode is installed at $xcode_path"
fi
# Check if conda is installed
if ! command -v conda &> /dev/null; then
echo "Conda is not installed. Installing Miniforge3 which includes conda..."
install_conda
else
echo "Conda is installed."
# TODO: Check if the conda version for MacOS that supports Metal GPU is installed
# conda_version=$(conda --version)
# if [[ $conda_version != *"Miniforge3"* ]]; then
# echo "Conda version that supports Metal GPU is not installed. Installing..."
# install_conda
# else
# echo "Conda version that supports M1/M2 is installed."
# fi
fi
# Check if the conda environment 'llama-gpt' exists
if conda env list | grep -q 'llama-gpt'; then
echo "Conda environment 'llama-gpt' already exists."
else
echo "Creating a conda environment called 'llama-gpt'..."
conda create -n llama-gpt python=$(python3 --version | cut -d ' ' -f 2)
fi
# Check if the conda environment 'llama-gpt' is active
if [[ "$(conda info --envs | grep '*' | awk '{print $1}')" != "llama-gpt" ]]; then
echo "Activating the conda environment 'llama-gpt'..."
conda activate llama-gpt
else
echo "Conda environment 'llama-gpt' is already active."
fi
# Parse command line arguments for --model
while [[ "$#" -gt 0 ]]; do
case $1 in
--model) MODEL="$2"; shift ;;
*) echo "Unknown parameter passed: $1"; exit 1 ;;
esac
shift
done
# If no model is passed, default to 7b model
if [[ -z "$MODEL" ]]; then
echo "No model value provided. Defaulting to 7b. If you want to change the model, exit the script and use --model to provide the model value."
echo "Supported models are 7b, 13b, 70b, code-7b, code-13b, code-34b."
MODEL="7b"
fi
# Get the number of available CPU cores and subtract 2
n_threads=$(($(sysctl -n hw.logicalcpu) - 2))
# Define context window
n_ctx=4096
# Define batch size
n_batch=2096
# Define number of GPU layers
n_gpu_layers=1
# Define grouping factor
N_GQA=1
model_type="gguf"
# Set values for MODEL and MODEL_DOWNLOAD_URL based on the model passed
case $MODEL in
7b)
MODEL="./models/llama-2-7b-chat.bin"
MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin"
model_type="ggml"
;;
13b)
MODEL="./models/llama-2-13b-chat.bin"
MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/Nous-Hermes-Llama2-GGML/resolve/main/nous-hermes-llama2-13b.ggmlv3.q4_0.bin"
model_type="ggml"
n_gpu_layers=2
;;
70b)
MODEL="./models/llama-2-70b-chat.bin"
MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGML/resolve/main/llama-2-70b-chat.ggmlv3.q4_0.bin"
model_type="ggml"
n_gpu_layers=3
# Llama 2 70B's grouping factor is 8 compared to 7B and 13B's 1.
N_GQA=8
;;
code-7b)
MODEL="./models/code-llama-7b-chat.gguf"
MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q4_K_M.gguf"
DEFAULT_SYSTEM_PROMPT="You are a helpful coding assistant. Use markdown when responding with code."
n_gpu_layers=1
n_ctx=8192
;;
code-13b)
MODEL="./models/code-llama-13b-chat.gguf"
MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q4_K_M.gguf"
DEFAULT_SYSTEM_PROMPT="You are a helpful coding assistant. Use markdown when responding with code."
n_gpu_layers=2
n_ctx=8192
;;
code-34b)
MODEL="./models/code-llama-34b-chat.gguf"
MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v1-GGUF/resolve/main/phind-codellama-34b-v1.Q4_K_M.gguf"
DEFAULT_SYSTEM_PROMPT="You are a helpful coding assistant. Use markdown when responding with code."
n_gpu_layers=3
n_ctx=8192
# Code Llama 34B's grouping factor is 8 compared to 7B and 13B's 1.
N_GQA=8
;;
*)
echo "Invalid model passed: $MODEL"; exit 1
;;
esac
# Check if llama-cpp-python is already installed
llama_cpp_python_installed=$(pip3 list | grep -q llama-cpp-python && echo "installed" || echo "not installed")
if [[ "$llama_cpp_python_installed" == "not installed" ]]; then
echo "llama-cpp-python is not installed. Installing..."
if [[ "$model_type" == "ggml" ]]; then
install_llama_cpp_python "GGML" "0.1.78"
else
install_llama_cpp_python "GGUF" "0.1.80"
fi
else
echo "llama-cpp-python is installed."
if [[ "$model_type" == "ggml" ]]; then
install_llama_cpp_python "GGML" "0.1.78"
else
install_llama_cpp_python "GGUF" "0.1.80"
fi
fi
# Check if the model file exists
if [ ! -f $MODEL ]; then
echo "Model file not found. Downloading..."
# Download the model file with a custom progress bar showing percentage, download speed, downloaded, total size, and estimated time remaining
curl -L -o $MODEL $MODEL_DOWNLOAD_URL
if [ $? -ne 0 ]; then
echo "Download failed. Trying with TLS 1.2..."
curl -L --tlsv1.2 -o $MODEL $MODEL_DOWNLOAD_URL
fi
else
echo "$MODEL model found."
fi
# Display configuration information
echo "Initializing server with:"
echo "Batch size: $n_batch"
echo "Number of CPU threads: $n_threads"
echo "Number of GPU layers: $n_gpu_layers"
echo "Context window: $n_ctx"
echo "GQA: $n_gqa"
# Export environment variables
export MODEL
export N_GQA
export DEFAULT_SYSTEM_PROMPT
# Run docker-compose with the macOS yml file
docker compose -f ./docker-compose-mac.yml up --remove-orphans --build &
# Run the server
python3 -m llama_cpp.server --n_ctx $n_ctx --n_threads $n_threads --n_gpu_layers $n_gpu_layers --n_batch $n_batch --model $MODEL --port 3001 &
# Define a function to stop docker-compose and the python3 command
stop_commands() {
echo "Stopping docker-compose..."
docker compose -f ./docker-compose-mac.yml down
echo "Stopping python server..."
pkill -f "python3 -m llama_cpp.server"
echo "Deactivating conda environment..."
conda deactivate
echo "All processes stopped."
}
# Set a trap to catch SIGINT and stop the commands
trap stop_commands SIGINT
# Wait for both commands to finish
wait $DOCKER_COMPOSE_PID
wait $PYTHON_PID