integration: Add Hugging Face local models; ST for embeddings #402

Merged 3 commits on Nov 4, 2024
5 changes: 5 additions & 0 deletions .changeset/plenty-pumpkins-fold.md
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

Add local models via Hugging Face; use Sentence Transformers with ONNX instead of FastEmbed (adds support for more models)
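
For context, a minimal sketch of the new embedding path (assuming `llama-index-embeddings-huggingface>=0.3.1` and a Sentence Transformers version that accepts the `backend` argument; the model name and dimensions are the defaults this PR wires up):

```python
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# backend="onnx" runs the Sentence Transformers model via ONNX Runtime
# instead of PyTorch, which is the default this PR configures.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    backend="onnx",
)

vector = embed_model.get_text_embedding("Hello, world!")
print(len(vector))  # 384 for all-MiniLM-L6-v2
```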
14 changes: 14 additions & 0 deletions helpers/env-variables.ts
@@ -336,6 +336,20 @@ const getModelEnvs = (modelConfig: ModelConfig): EnvVar[] => {
},
]
: []),
...(modelConfig.provider === "huggingface"
? [
{
name: "EMBEDDING_BACKEND",
description:
"The backend to use for the Sentence Transformers embedding model, either 'torch', 'onnx', or 'openvino'. Defaults to 'onnx'.",
},
{
name: "EMBEDDING_TRUST_REMOTE_CODE",
description:
"Whether to trust remote code for the embedding model, required for some models with custom code.",
},
]
: []),
...(modelConfig.provider === "t-systems"
? [
{
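Taken together, a generated project's `.env` could then include entries like these (hypothetical values; the defaults match the descriptions above):

```
# Sentence Transformers backend: 'torch', 'onnx', or 'openvino' (defaults to 'onnx')
EMBEDDING_BACKEND=onnx
# Only needed for embedding models that ship custom modeling code
EMBEDDING_TRUST_REMOTE_CODE=false
```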
61 changes: 61 additions & 0 deletions helpers/providers/huggingface.ts
@@ -0,0 +1,61 @@
import prompts from "prompts";
import { ModelConfigParams } from ".";
import { questionHandlers, toChoice } from "../../questions/utils";

const MODELS = ["HuggingFaceH4/zephyr-7b-alpha"];
type ModelData = {
dimensions: number;
};
const EMBEDDING_MODELS: Record<string, ModelData> = {
"all-MiniLM-L6-v2": { dimensions: 384 },
};
Comment on lines +5 to +11 (Collaborator):

@tomaarsen please add other models that run well locally. (I added a model selector that you can call with `create-llama --ask-models`)


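For reference, the selector mentioned in the comment above would be invoked roughly like this (assuming the usual `npx` entry point for create-llama):

```
npx create-llama --ask-models
```
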
const DEFAULT_MODEL = MODELS[0];
const DEFAULT_EMBEDDING_MODEL = Object.keys(EMBEDDING_MODELS)[0];
const DEFAULT_DIMENSIONS = Object.values(EMBEDDING_MODELS)[0].dimensions;

type HuggingfaceQuestionsParams = {
askModels: boolean;
};

export async function askHuggingfaceQuestions({
askModels,
}: HuggingfaceQuestionsParams): Promise<ModelConfigParams> {
const config: ModelConfigParams = {
model: DEFAULT_MODEL,
embeddingModel: DEFAULT_EMBEDDING_MODEL,
dimensions: DEFAULT_DIMENSIONS,
isConfigured(): boolean {
return true;
},
};

if (askModels) {
const { model } = await prompts(
{
type: "select",
name: "model",
message: "Which Hugging Face model would you like to use?",
choices: MODELS.map(toChoice),
initial: 0,
},
questionHandlers,
);
config.model = model;

const { embeddingModel } = await prompts(
{
type: "select",
name: "embeddingModel",
message: "Which embedding model would you like to use?",
choices: Object.keys(EMBEDDING_MODELS).map(toChoice),
initial: 0,
},
questionHandlers,
);
config.embeddingModel = embeddingModel;
config.dimensions = EMBEDDING_MODELS[embeddingModel].dimensions;
}

return config;
}
5 changes: 5 additions & 0 deletions helpers/providers/index.ts
@@ -5,6 +5,7 @@ import { askAnthropicQuestions } from "./anthropic";
import { askAzureQuestions } from "./azure";
import { askGeminiQuestions } from "./gemini";
import { askGroqQuestions } from "./groq";
import { askHuggingfaceQuestions } from "./huggingface";
import { askLLMHubQuestions } from "./llmhub";
import { askMistralQuestions } from "./mistral";
import { askOllamaQuestions } from "./ollama";
@@ -39,6 +40,7 @@ export async function askModelConfig({

if (framework === "fastapi") {
choices.push({ title: "T-Systems", value: "t-systems" });
choices.push({ title: "Huggingface", value: "huggingface" });
}
const { provider } = await prompts(
{
@@ -76,6 +78,9 @@
case "t-systems":
modelConfig = await askLLMHubQuestions({ askModels });
break;
case "huggingface":
modelConfig = await askHuggingfaceQuestions({ askModels });
break;
default:
modelConfig = await askOpenAIQuestions({
openAiKey,
30 changes: 14 additions & 16 deletions helpers/python.ts
@@ -173,35 +173,23 @@ const getAdditionalDependencies = (
}
break;
case "groq":
// Fastembed==0.2.0 does not support python3.13 at the moment
// Fixed the python version less than 3.13
dependencies.push({
name: "python",
version: "^3.11,<3.13",
});
dependencies.push({
name: "llama-index-llms-groq",
version: "0.2.0",
});
dependencies.push({
name: "llama-index-embeddings-fastembed",
version: "^0.2.0",
name: "llama-index-embeddings-huggingface",
version: "^0.3.1",
});
break;
case "anthropic":
// Fastembed==0.2.0 does not support python3.13 at the moment
// Fixed the python version less than 3.13
dependencies.push({
name: "python",
version: "^3.11,<3.13",
});
dependencies.push({
name: "llama-index-llms-anthropic",
version: "0.3.0",
});
dependencies.push({
name: "llama-index-embeddings-fastembed",
version: "^0.2.0",
name: "llama-index-embeddings-huggingface",
version: "^0.3.1",
});
break;
case "gemini":
@@ -234,6 +222,16 @@
version: "0.2.4",
});
break;
case "huggingface":
dependencies.push({
name: "llama-index-llms-huggingface",
version: "^0.3.5",
});
dependencies.push({
name: "llama-index-embeddings-huggingface",
version: "^0.3.1",
});
break;
case "t-systems":
dependencies.push({
name: "llama-index-agent-openai",
1 change: 1 addition & 0 deletions helpers/types.ts
@@ -9,6 +9,7 @@ export type ModelProvider =
| "gemini"
| "mistral"
| "azure-openai"
| "huggingface"
| "t-systems";
export type ModelConfig = {
provider: ModelProvider;
51 changes: 32 additions & 19 deletions templates/components/settings/python/settings.py
@@ -21,6 +21,8 @@ def init_settings():
init_mistral()
case "azure-openai":
init_azure_openai()
case "huggingface":
init_huggingface()
case "t-systems":
from .llmhub import init_llmhub

@@ -113,29 +115,40 @@ def init_azure_openai():
)


def init_fastembed():
def init_huggingface_embedding():
try:
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
except ImportError:
raise ImportError(
"FastEmbed support is not installed. Please install it with `poetry add llama-index-embeddings-fastembed`"
"Hugging Face support is not installed. Please install it with `poetry add llama-index-embeddings-huggingface`"
)

embed_model_map: Dict[str, str] = {
# Small and multilingual
"all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
# Large and multilingual
"paraphrase-multilingual-mpnet-base-v2": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
}
embedding_model = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
backend = os.getenv("EMBEDDING_BACKEND", "onnx") # "torch", "onnx", or "openvino"
trust_remote_code = (
os.getenv("EMBEDDING_TRUST_REMOTE_CODE", "false").lower() == "true"
)

Settings.embed_model = HuggingFaceEmbedding(
model_name=embedding_model,
trust_remote_code=trust_remote_code,
backend=backend,
)


embedding_model = os.getenv("EMBEDDING_MODEL")
if embedding_model is None:
raise ValueError("EMBEDDING_MODEL environment variable is not set")
def init_huggingface():
try:
from llama_index.llms.huggingface import HuggingFaceLLM
except ImportError:
raise ImportError(
"Hugging Face support is not installed. Please install it with `poetry add llama-index-llms-huggingface` and `poetry add llama-index-embeddings-huggingface`"
)

# This will download the model automatically if it is not already downloaded
Settings.embed_model = FastEmbedEmbedding(
model_name=embed_model_map[embedding_model]
Settings.llm = HuggingFaceLLM(
model_name=os.getenv("MODEL"),
tokenizer_name=os.getenv("MODEL"),
)
init_huggingface_embedding()
marcusschiesser marked this conversation as resolved.


def init_groq():
@@ -147,8 +160,8 @@ def init_groq():
)

Settings.llm = Groq(model=os.getenv("MODEL"))
# Groq does not provide embeddings, so we use FastEmbed instead
init_fastembed()
# Groq does not provide embeddings, so we use open Sentence Transformer models instead
init_huggingface_embedding()


def init_anthropic():
@@ -168,8 +181,8 @@ def init_anthropic():
}

Settings.llm = Anthropic(model=model_map[os.getenv("MODEL")])
# Anthropic does not provide embeddings, so we use FastEmbed instead
init_fastembed()
# Anthropic does not provide embeddings, so we use open Sentence Transformer models instead
Comment (Collaborator):

@tomaarsen we actually had a PR that replaced Sentence Transformers (llama-index-embeddings-huggingface@0.2.0) with FastEmbed (see https://github.com/run-llama/create-llama/pull/162/files) - the reasons were:

  1. Sentence Transformers was too large a dependency
  2. we had problems running it in the Docker container of RAGapp (some PyTorch issue that I forgot about)

Before we switch back - how is the situation now?


init_huggingface_embedding()


def init_gemini():
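
As a quick end-to-end check of the new provider in a generated FastAPI project, something like the following should work (a sketch: the `app.settings` import path and the `MODEL_PROVIDER` variable name are assumptions based on the template layout; the model values are this PR's defaults):

```python
import os

# Hypothetical environment for a local run; values mirror the PR's defaults.
os.environ["MODEL_PROVIDER"] = "huggingface"  # assumed variable name
os.environ["MODEL"] = "HuggingFaceH4/zephyr-7b-alpha"
os.environ["EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"
os.environ["EMBEDDING_BACKEND"] = "onnx"

from app.settings import init_settings  # import path assumed

# Configures Settings.llm (HuggingFaceLLM) and Settings.embed_model
# (HuggingFaceEmbedding with the ONNX backend) as added in this PR.
init_settings()
```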