common: llama_load_model_from_url using --model-url #6098

Merged
53 commits merged on Mar 17, 2024

Changes from 6 commits

Commits (53)
3221ab0
common: introduce llama_load_model_from_url to download model from hf…
phymbert Mar 16, 2024
a0ebdfc
common: llama_load_model_from_url witch to libcurl dependency
phymbert Mar 16, 2024
42b25da
common: PR feedback, rename the definition to LLAMA_USE_CURL
phymbert Mar 16, 2024
7e78285
common: LLAMA_USE_CURL in make toolchain
phymbert Mar 16, 2024
df0d822
ci: compile the server with curl, add make option curl example in def…
phymbert Mar 16, 2024
80bec98
llama_load_model_from_url: try to make the windows build passing
phymbert Mar 16, 2024
2c3a00e
Update Makefile
phymbert Mar 16, 2024
4135d4a
llama_load_model_from_url: typo
phymbert Mar 16, 2024
5d99f32
llama_load_model_from_url: download the file only if modified based o…
phymbert Mar 16, 2024
921e4af
ci: build, fix the default build to use LLAMA_CURL
phymbert Mar 16, 2024
6633689
llama_load_model_from_url: cleanup code
phymbert Mar 16, 2024
1430e89
Merge branch 'master' into hp/download-model-from-hf
phymbert Mar 16, 2024
e84206d
Update examples/server/README.md
phymbert Mar 16, 2024
4bc47b7
Update common/common.cpp
phymbert Mar 16, 2024
8751bd0
Update common/common.cpp
phymbert Mar 16, 2024
f53bfd5
Update common/common.cpp
phymbert Mar 16, 2024
b088122
Update common/common.cpp
phymbert Mar 16, 2024
f22456d
Update common/common.cpp
phymbert Mar 16, 2024
9565ae3
Update common/common.cpp
phymbert Mar 16, 2024
330e28d
Update common/common.cpp
phymbert Mar 16, 2024
89ab37a
Update common/common.cpp
phymbert Mar 16, 2024
be561a7
Update common/common.cpp
phymbert Mar 16, 2024
eb9e52a
Update common/common.cpp
phymbert Mar 16, 2024
b0b49e0
Update examples/main/README.md
phymbert Mar 16, 2024
545fef6
llama_load_model_from_url: fix compilation warning, clearer logging
phymbert Mar 16, 2024
4fadb07
server: tests: add `--model-url` tests
phymbert Mar 16, 2024
124c474
llama_load_model_from_url: coherent clearer logging
phymbert Mar 16, 2024
064dc07
common: CMakeLists.txt fix typo in logging when lib curl is not found
phymbert Mar 16, 2024
838178a
ci: tests: windows tests add libcurl
phymbert Mar 16, 2024
176f039
ci: tests: windows tests add libcurl
phymbert Mar 16, 2024
5df5605
ci: build: add libcurl in default make toolchain step
phymbert Mar 16, 2024
78812c6
llama_load_model_from_url: PR feedback, use snprintf instead of strnc…
phymbert Mar 16, 2024
1ad5a45
ci: build: add libcurl in default make toolchain step for tests
phymbert Mar 16, 2024
22b3bb3
common: fix windows build caused by double windows.h import
phymbert Mar 16, 2024
e6848ab
build: move the make build with env LLAMA_CURL to a dedicated place
phymbert Mar 16, 2024
d81acb6
build: introduce cmake option LLAMA_CURL to trigger libcurl linking t…
phymbert Mar 16, 2024
dbd9691
build: move the make build with env LLAMA_CURL to a dedicated place
phymbert Mar 16, 2024
9da4eec
llama_load_model_from_url: minor spacing and log message changes
phymbert Mar 16, 2024
89d3483
ci: build: fix ubuntu-focal-make-curl
phymbert Mar 16, 2024
13d8817
ci: build: try to fix the windows build
phymbert Mar 16, 2024
1ddaf71
common: remove old dependency to openssl
phymbert Mar 16, 2024
73b4b44
common: fix build
phymbert Mar 16, 2024
a3ed3d4
common: fix windows build
phymbert Mar 17, 2024
5e66ec8
common: fix windows tests
phymbert Mar 17, 2024
9ca4acc
common: fix windows tests
phymbert Mar 17, 2024
c1b002e
common: llama_load_model_from_url windows set CURLOPT_SSL_OPTIONS, CU…
phymbert Mar 17, 2024
cff7faa
ci: tests: print server logs in case of scenario failure
phymbert Mar 17, 2024
4fe431d
common: llama_load_model_from_url: make it working on windows: disabl…
phymbert Mar 17, 2024
47a9e5d
ci: tests: increase timeout for windows
phymbert Mar 17, 2024
31272c6
common: fix typo
phymbert Mar 17, 2024
f902ab6
common: llama_load_model_from_url use a temporary file for downloading
phymbert Mar 17, 2024
b24f30f
common: llama_load_model_from_url delete previous file before downloa…
phymbert Mar 17, 2024
fcf327f
ci: tests: fix behavior on windows
phymbert Mar 17, 2024
1 change: 1 addition & 0 deletions .github/workflows/build.yml
@@ -39,6 +39,7 @@ jobs:
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
LLAMA_USE_CURL: 1
run: |
CC=gcc-8 make -j $(nproc)

3 changes: 2 additions & 1 deletion .github/workflows/server.yml
@@ -57,7 +57,8 @@ jobs:
cmake \
python3-pip \
wget \
language-pack-en
language-pack-en \
libcurl4-openssl-dev

- name: Build
id: cmake_build
5 changes: 5 additions & 0 deletions Makefile
@@ -595,6 +595,11 @@ include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
endif

ifdef LLAMA_USE_CURL
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
override LDFLAGS := $(LDFLAGS) -lcurl
endif

#
# Print build information
#
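Usage note (not part of the diff): this makes curl support opt-in for make builds, e.g. `LLAMA_USE_CURL=1 make`, which adds `-DLLAMA_USE_CURL` to the compile flags and links `-lcurl`; the CI change in build.yml above sets the same variable.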
10 changes: 10 additions & 0 deletions common/CMakeLists.txt
@@ -47,6 +47,16 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

# Check for curl
find_package(CURL QUIET)
if (CURL_FOUND)
add_definitions(-DLLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
link_libraries(${CURL_LIBRARIES})
else()
message(INFO "libcurl not found. Building without model download support.")
endif ()


set(TARGET common)

84 changes: 83 additions & 1 deletion common/common.cpp
@@ -16,6 +16,9 @@
#include <unordered_set>
#include <vector>
#include <cinttypes>
#ifdef LLAMA_USE_CURL
#include <curl/curl.h>
#endif

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
@@ -531,6 +534,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.model = argv[i];
} else if (arg == "-mu" || arg == "--model-url") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_url = argv[i];
} else if (arg == "-md" || arg == "--model-draft") {
if (++i >= argc) {
invalid_param = true;
@@ -1131,6 +1140,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
@@ -1376,10 +1387,81 @@ void llama_batch_add(
batch.n_tokens++;
}

#ifdef LLAMA_USE_CURL
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
struct llama_model_params params) {
// Initialize libcurl
curl_global_init(CURL_GLOBAL_DEFAULT);
auto curl = curl_easy_init();

if (!curl) {
curl_global_cleanup();
fprintf(stderr, "%s: error initializing lib curl\n", __func__);
return nullptr;
}

// Set the URL
curl_easy_setopt(curl, CURLOPT_URL, model_url);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);

// Set the output file
auto outfile = fopen(path_model, "wb");
if (!outfile) {
curl_easy_cleanup(curl);
curl_global_cleanup();
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
return nullptr;
}
curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);

// start the download
fprintf(stdout, "%s: downloading model from %s to %s ...\n", __func__, model_url, path_model);
auto res = curl_easy_perform(curl);
if (res != CURLE_OK) {
fclose(outfile);
curl_easy_cleanup(curl);
curl_global_cleanup();
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return nullptr;
}

int http_code = 0;
curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) {
fclose(outfile);
curl_easy_cleanup(curl);
curl_global_cleanup();
fprintf(stderr, "%s: invalid http status code failed: %d\n", __func__, http_code);
return nullptr;
}

// Clean up
fclose(outfile);
curl_easy_cleanup(curl);
curl_global_cleanup();

return llama_load_model_from_file(path_model, params);
}
#else

struct llama_model *llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
struct llama_model_params /*params*/) {
fprintf(stderr, "%s: llama.cpp built without curl support, downloading from an url not supported.\n", __func__);
return nullptr;
}

#endif

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);

llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
llama_model * model = nullptr;
if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
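For context (not part of the diff): a minimal sketch of how the new helper could be called directly, assuming the declaration added to common.h in this PR and the existing llama.cpp model API (`llama_model_default_params`, `llama_free_model`); the URL is the example used in the PR's README changes and the local filename is a placeholder.

#include "common.h"
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    // placeholder local path; downloads the file, then loads it via llama_load_model_from_file
    llama_model * model = llama_load_model_from_url(
        "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf",
        "phi-2-q4_0.gguf",
        mparams);

    if (model == nullptr) {
        // download failed, or llama.cpp was built without libcurl
        return 1;
    }

    // ... use the model ...
    llama_free_model(model);
    return 0;
}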
10 changes: 10 additions & 0 deletions common/common.h
@@ -17,6 +17,12 @@
#include <unordered_map>
#include <tuple>

#ifdef HAVE_OPENSSL
#include <openssl/ssl.h>
#include <openssl/bio.h>
#include <openssl/err.h>
#endif

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
@@ -89,6 +95,7 @@ struct gpt_params {
struct llama_sampling_params sparams;

std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_url = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
@@ -191,6 +198,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
struct llama_model_params params);

// Batch utils

void llama_batch_clear(struct llama_batch & batch);
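Similarly (not part of the diff), a sketch of the higher-level path used by the examples: per the common.cpp change above, `llama_init_from_gpt_params` downloads to `params.model` when `params.model_url` is set, then loads the local file. Paths are illustrative.

#include "common.h"
#include "llama.h"

#include <tuple>

int main() {
    gpt_params params;
    params.model_url = "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf";
    params.model     = "models/phi-2-q4_0.gguf"; // local file the download is written to

    llama_model *   model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // ... run inference with ctx ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}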
1 change: 1 addition & 0 deletions examples/main/README.md
@@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:

- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-mu MODEL_URL --model MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
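Usage illustration (not part of the diff): combining the new flag with `-m` to choose where the downloaded file is stored might look like `main -mu https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf -m models/phi-2-q4_0.gguf -p "Once upon a time"`; the local filename and prompt are placeholders.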
1 change: 1 addition & 0 deletions examples/server/README.md
@@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
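Usage illustration (not part of the diff): the server would presumably be launched the same way, e.g. `server -mu https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf -m models/phi-2-q4_0.gguf`, with `-m` giving the local path the download is written to.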
8 changes: 8 additions & 0 deletions examples/server/server.cpp
@@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
}
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" -a ALIAS, --alias ALIAS\n");
printf(" set an alias for the model, will be added as `model` field in completion response\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
@@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break;
}
params.model = argv[i];
} else if (arg == "-mu" || arg == "--model-url") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_url = argv[i];
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;