[Model] Add Llama 3.1 to prebuilt models (mlc-ai#513)
Add Llama 3.1 8B and 70B to the prebuilt models. Update the examples from
Llama 3 to Llama 3.1; the Llama 3 models are still kept.

Related PRs:
- mlc-ai/binary-mlc-llm-libs#131
- mlc-ai/mlc-llm#2682
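
For reference, loading one of the newly added models follows the same pattern as the README example updated below. A minimal sketch; the model id is one added to prebuiltAppConfig in this commit, and the prompt is illustrative:

import { CreateMLCEngine } from "@mlc-ai/web-llm";

// Model id added to the prebuilt list by this commit.
const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";

const engine = await CreateMLCEngine(selectedModel, {
  initProgressCallback: (report) => console.log(report.text),
});

// web-llm exposes an OpenAI-style chat completions API.
const reply = await engine.chat.completions.create({
  messages: [{ role: "user", content: "Hello!" }],
});
console.log(reply.choices[0].message.content);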
CharlieFRuan authored and jzhao62 committed Dec 8, 2024
1 parent 4f59b45 commit e460e3c
Showing 14 changed files with 110 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -124,7 +124,7 @@ import { CreateMLCEngine } from "@mlc-ai/web-llm";
const initProgressCallback = (initProgress) => {
console.log(initProgress);
}
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";

const engine = await CreateMLCEngine(
selectedModel,
4 changes: 2 additions & 2 deletions examples/get-started-web-worker/src/main.ts
@@ -17,7 +17,7 @@ async function mainNonStreaming() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";

const engine: webllm.MLCEngineInterface =
await webllm.CreateWebWorkerMLCEngine(
@@ -56,7 +56,7 @@ async function mainStreaming() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";

const engine: webllm.MLCEngineInterface =
await webllm.CreateWebWorkerMLCEngine(
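
The engine-creation calls above are truncated in this view. A hedged sketch of the full web-worker pattern; the worker path and module options are assumptions based on the example's usual layout, not shown in this diff:

import * as webllm from "@mlc-ai/web-llm";

// Sketch: run the model in a Web Worker so inference stays off the main thread.
const engine: webllm.MLCEngineInterface =
  await webllm.CreateWebWorkerMLCEngine(
    // Assumed worker path and bundler-style module worker.
    new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }),
    "Llama-3.1-8B-Instruct-q4f32_1-MLC",
    { initProgressCallback: (report) => console.log(report.text) },
  );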
10 changes: 5 additions & 5 deletions examples/get-started/src/get_started.ts
@@ -13,7 +13,7 @@ async function main() {
setLabel("init-label", report.text);
};
// Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts`
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{
@@ -32,12 +32,12 @@ async function main() {
// const appConfig: webllm.AppConfig = {
// model_list: [
// {
// model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f32_1-MLC",
// model_id: "Llama-3-8B-Instruct-q4f32_1-MLC",
// model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC",
// model_id: "Llama-3.1-8B-Instruct-q4f32_1-MLC",
// model_lib:
// webllm.modelLibURLPrefix +
// webllm.modelVersion +
// "/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
// "/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
// overrides: {
// context_window_size: 2048,
// },
@@ -62,7 +62,7 @@ async function main() {
n: 3,
temperature: 1.5,
max_tokens: 256,
-// 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in Llama-3-8B-Instruct
+// 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in Llama-3.1-8B-Instruct
// So we would have a higher chance of seeing the latter two, but never the first in the answer
logit_bias: {
"46510": -100,
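
A note on the logit_bias example above: Llama 3.1 uses the same tokenizer as Llama 3, so the token ids in the comment stay valid and only the model name needed updating. A minimal sketch of the pattern; only the "46510": -100 entry is visible in this diff, so the remaining values are illustrative (-100 effectively bans a token, +100 strongly favors one):

// `engine` is an MLCEngineInterface created as in the example above.
const reply = await engine.chat.completions.create({
  messages: [{ role: "user", content: "Name a US state." }],
  logit_bias: {
    "46510": -100, // "California", first token (value from the diff)
    "7188": -100, // "California", second token (illustrative value)
    "8421": 100, // "Texas", first token (illustrative value)
    "51325": 100, // "Texas", second token (illustrative value)
  },
});
console.log(reply.choices[0].message.content);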
2 changes: 1 addition & 1 deletion examples/json-mode/src/json_mode.ts
@@ -12,7 +12,7 @@ async function main() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{ initProgressCallback: initProgressCallback },
4 changes: 2 additions & 2 deletions examples/json-schema/src/json_schema.ts
@@ -38,7 +38,7 @@ async function simpleStructuredTextExample() {
setLabel("init-label", report.text);
};
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
"Llama-3-8B-Instruct-q4f16_1-MLC",
"Llama-3.1-8B-Instruct-q4f16_1-MLC",
{ initProgressCallback: initProgressCallback },
);

@@ -107,7 +107,7 @@ async function harryPotterExample() {
};

const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
"Llama-3-8B-Instruct-q4f16_1-MLC",
"Llama-3.1-8B-Instruct-q4f16_1-MLC",
{ initProgressCallback: initProgressCallback },
);

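
Both examples in this file constrain generation with a JSON schema passed through response_format. A hedged sketch of the pattern, with an illustrative schema and prompt; web-llm's ResponseFormat takes the schema as a JSON string:

// Illustrative schema, serialized because the schema field is a string.
const schema = JSON.stringify({
  type: "object",
  properties: { name: { type: "string" }, house: { type: "string" } },
  required: ["name", "house"],
});

// `engine` is an MLCEngineInterface created as in the examples above.
const reply = await engine.chat.completions.create({
  messages: [{ role: "user", content: "Invent a Hogwarts student as JSON." }],
  response_format: { type: "json_object", schema } as webllm.ResponseFormat,
});
console.log(reply.choices[0].message.content); // JSON conforming to the schema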
2 changes: 1 addition & 1 deletion examples/multi-round-chat/src/multi_round_chat.ts
@@ -17,7 +17,7 @@ async function main() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{ initProgressCallback: initProgressCallback },
2 changes: 1 addition & 1 deletion examples/next-simple-chat/src/utils/chat_ui.ts
@@ -70,7 +70,7 @@ export default class ChatUI {
this.engine.setInitProgressCallback(initProgressCallback);

try {
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
// const selectedModel = "TinyLlama-1.1B-Chat-v0.4-q4f16_1-MLC-1k";
await this.engine.reload(selectedModel);
} catch (err: unknown) {
2 changes: 1 addition & 1 deletion examples/seed-to-reproduce/src/seed.ts
@@ -18,7 +18,7 @@ async function main() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{ initProgressCallback: initProgressCallback },
4 changes: 2 additions & 2 deletions examples/service-worker/src/main.ts
@@ -37,7 +37,7 @@ async function mainNonStreaming() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";

const engine: webllm.MLCEngineInterface =
await webllm.CreateServiceWorkerMLCEngine(selectedModel, {
@@ -75,7 +75,7 @@ async function mainStreaming() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";

const engine: webllm.ServiceWorkerMLCEngine =
await webllm.CreateServiceWorkerMLCEngine(selectedModel, {
2 changes: 1 addition & 1 deletion examples/simple-chat-js/index.js
@@ -11,7 +11,7 @@ const messages = [
const availableModels = webllm.prebuiltAppConfig.model_list.map(
(m) => m.model_id,
);
-let selectedModel = "Llama-3-8B-Instruct-q4f32_1-1k";
+let selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-1k";

// Callback function for initializing progress
function updateEngineInitProgressCallback(report) {
2 changes: 1 addition & 1 deletion examples/streaming/src/streaming.ts
@@ -15,7 +15,7 @@ async function main() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
-const selectedModel = "Llama-3-8B-Instruct-q4f32_1-MLC";
+const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{ initProgressCallback: initProgressCallback },
92 changes: 79 additions & 13 deletions src/config.ts
@@ -300,51 +300,51 @@ export const prebuiltAppConfig: AppConfig = {
model_list: [
// Llama-3
{
model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f32_1-MLC",
model_id: "Llama-3-8B-Instruct-q4f32_1-MLC-1k",
model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC",
model_id: "Llama-3.1-8B-Instruct-q4f32_1-MLC-1k",
model_lib:
modelLibURLPrefix +
modelVersion +
"/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
"/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
vram_required_MB: 5295.7,
low_resource_required: true,
overrides: {
context_window_size: 1024,
},
},
{
model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
model_id: "Llama-3-8B-Instruct-q4f16_1-MLC-1k",
model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f16_1-MLC",
model_id: "Llama-3.1-8B-Instruct-q4f16_1-MLC-1k",
model_lib:
modelLibURLPrefix +
modelVersion +
"/Llama-3-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm",
"/Llama-3_1-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm",
vram_required_MB: 4598.34,
low_resource_required: true,
overrides: {
context_window_size: 1024,
},
},
{
model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f32_1-MLC",
model_id: "Llama-3-8B-Instruct-q4f32_1-MLC",
model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC",
model_id: "Llama-3.1-8B-Instruct-q4f32_1-MLC",
model_lib:
modelLibURLPrefix +
modelVersion +
"/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
"/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
vram_required_MB: 6101.01,
low_resource_required: false,
overrides: {
context_window_size: 4096,
},
},
{
model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
model_id: "Llama-3-8B-Instruct-q4f16_1-MLC",
model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f16_1-MLC",
model_id: "Llama-3.1-8B-Instruct-q4f16_1-MLC",
model_lib:
modelLibURLPrefix +
modelVersion +
"/Llama-3-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm",
"/Llama-3_1-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm",
vram_required_MB: 5001.0,
low_resource_required: false,
overrides: {
@@ -1031,7 +1031,73 @@
},
},
// BELOW ARE MODELS OF OLDER VERSIONS OR NOT AS PRACTICAL
-// Llama-3 70B
+// Llama-3.1 70B
+{
+model: "https://huggingface.co/mlc-ai/Llama-3.1-70B-Instruct-q3f16_1-MLC",
+model_id: "Llama-3.1-70B-Instruct-q3f16_1-MLC",
+model_lib:
+modelLibURLPrefix +
+modelVersion +
+"/Llama-3_1-70B-Instruct-q3f16_1-ctx4k_cs1k-webgpu.wasm",
+vram_required_MB: 31153.13,
+low_resource_required: false,
+overrides: {
+context_window_size: 4096,
+},
+},
+// Llama-3
+{
+model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f32_1-MLC",
+model_id: "Llama-3-8B-Instruct-q4f32_1-MLC-1k",
+model_lib:
+modelLibURLPrefix +
+modelVersion +
+"/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
+vram_required_MB: 5295.7,
+low_resource_required: true,
+overrides: {
+context_window_size: 1024,
+},
+},
+{
+model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
+model_id: "Llama-3-8B-Instruct-q4f16_1-MLC-1k",
+model_lib:
+modelLibURLPrefix +
+modelVersion +
+"/Llama-3-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm",
+vram_required_MB: 4598.34,
+low_resource_required: true,
+overrides: {
+context_window_size: 1024,
+},
+},
+{
+model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f32_1-MLC",
+model_id: "Llama-3-8B-Instruct-q4f32_1-MLC",
+model_lib:
+modelLibURLPrefix +
+modelVersion +
+"/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
+vram_required_MB: 6101.01,
+low_resource_required: false,
+overrides: {
+context_window_size: 4096,
+},
+},
+{
+model: "https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
+model_id: "Llama-3-8B-Instruct-q4f16_1-MLC",
+model_lib:
+modelLibURLPrefix +
+modelVersion +
+"/Llama-3-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm",
+vram_required_MB: 5001.0,
+low_resource_required: false,
+overrides: {
+context_window_size: 4096,
+},
+},
{
model: "https://huggingface.co/mlc-ai/Llama-3-70B-Instruct-q3f16_1-MLC",
model_id: "Llama-3-70B-Instruct-q3f16_1-MLC",
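
Each prebuilt entry above carries vram_required_MB and low_resource_required, so an application can filter the list at runtime. A minimal sketch using the fields shown in this diff; the 6000 MB budget is an arbitrary example:

import { prebuiltAppConfig } from "@mlc-ai/web-llm";

// Pick the Llama 3.1 variants that fit a given VRAM budget.
const budgetMB = 6000;
const candidates = prebuiltAppConfig.model_list
  .filter((m) => m.model_id.startsWith("Llama-3.1"))
  .filter((m) => (m.vram_required_MB ?? Infinity) <= budgetMB)
  .map((m) => m.model_id);
console.log(candidates);
// e.g. ["Llama-3.1-8B-Instruct-q4f32_1-MLC-1k",
//       "Llama-3.1-8B-Instruct-q4f16_1-MLC-1k",
//       "Llama-3.1-8B-Instruct-q4f16_1-MLC"]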
2 changes: 1 addition & 1 deletion src/engine.ts
@@ -673,7 +673,7 @@ export class MLCEngine implements MLCEngineInterface {
`WARNING: the current maxStorageBufferBindingSize ` +
`(${computeMB(maxStorageBufferBindingSize)}) ` +
`may only work for a limited number of models, e.g.: \n` +
-`- Llama-3-8B-Instruct-q4f16_1-MLC-1k \n` +
+`- Llama-3.1-8B-Instruct-q4f16_1-MLC-1k \n` +
`- Llama-2-7b-chat-hf-q4f16_1-MLC-1k \n` +
`- RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC-1k \n` +
`- RedPajama-INCITE-Chat-3B-v1-q4f32_1-MLC-1k \n` +
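
The warning above is driven by the device's WebGPU limits. A minimal sketch of checking the same limit before picking a model; this is plain WebGPU API, independent of web-llm:

// Query the adapter limit that the warning above is based on.
const adapter = await navigator.gpu.requestAdapter();
if (adapter) {
  const maxMB = adapter.limits.maxStorageBufferBindingSize / (1024 * 1024);
  console.log(`maxStorageBufferBindingSize: ${maxMB.toFixed(0)} MB`);
  // On devices with a small limit, prefer the 1k-context variants named in
  // the warning, e.g. Llama-3.1-8B-Instruct-q4f16_1-MLC-1k.
}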