Llama3 #428

Merged (2 commits) on May 16, 2024
refact_known_models/huggingface.py (12 additions, 1 deletion)

@@ -240,5 +240,16 @@
"required_memory_mb": 20000,
"T": 4096,
"filter_caps": ["completion", "finetune"],
}
},
# NOTE: this repo is gated so we need /tokenizer handler to load tokenizer from docker directly
"llama3/8b/instruct": {
"backend": "transformers",
"model_path": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_class_kwargs": {
"torch_dtype": "bf16",
},
"required_memory_mb": 20000,
"T": 4096,
"filter_caps": ["chat"],
},
}
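The NOTE in the new registry entry is the motivation for the rest of this PR: the meta-llama repo is gated, so clients can no longer fetch tokenizer.json from huggingface.co anonymously, and the server must hand it out itself. As a minimal sketch of what this entry implies at load time (not part of the PR; the token value, the `token=` keyword of recent transformers, and the "bf16" to `torch.bfloat16` mapping are assumptions):

```python
# Sketch only: loading the gated model named by the registry entry above.
# After an authenticated download, tokenizer.json sits in the local HF cache,
# which is what the new /tokenizer handler later serves from.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
hf_token = "hf_..."  # hypothetical: a token with access to the gated repo

tokenizer = AutoTokenizer.from_pretrained(model_path, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # the registry's "bf16" kwarg, resolved
    token=hf_token,
)
```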
refact_webgui/webgui/selfhost_fastapi_completions.py (43 additions, 3 deletions)

@@ -3,6 +3,7 @@
 import copy
 import asyncio
 import aiohttp
+import aiofiles
 import termcolor
 import os
 import re

@@ -15,14 +16,14 @@
 from refact_utils.scripts import env
 from refact_utils.finetune.utils import running_models_and_loras
 from refact_webgui.webgui.selfhost_model_resolve import resolve_model_context_size
-from refact_webgui.webgui.selfhost_model_resolve import resolve_tokenizer_name_for_model
 from refact_webgui.webgui.selfhost_model_resolve import static_resolve_model
 from refact_webgui.webgui.selfhost_queue import Ticket
 from refact_webgui.webgui.selfhost_webutils import log
 from refact_webgui.webgui.selfhost_queue import InferenceQueue
 from refact_webgui.webgui.selfhost_model_assigner import ModelAssigner
 from refact_webgui.webgui.selfhost_login import RefactSession

+from pathlib import Path
 from pydantic import BaseModel
 from typing import List, Dict, Union, Optional, Tuple, Any

@@ -193,6 +194,7 @@ def __init__(self,
         self.add_api_route("/v1/chat/completions", self._chat_completions, methods=["POST"])

         self.add_api_route("/v1/models", self._models, methods=["GET"])
+        self.add_api_route("/tokenizer/{model_name}", self._tokenizer, methods=["GET"])

         self._inference_queue = inference_queue
         self._id2ticket = id2ticket
@@ -263,8 +265,8 @@ def _caps_base_data(self) -> Dict[str, Any]:
             "endpoint_embeddings_style": "openai",
             "size_embeddings": 768,

-            "tokenizer_path_template": "https://huggingface.co/$MODEL/resolve/main/tokenizer.json",
-            "tokenizer_rewrite_path": {model: t for model in models_available if (t := resolve_tokenizer_name_for_model(model, self._model_assigner))},
+            "tokenizer_path_template": "/tokenizer/$MODEL",
+            "tokenizer_rewrite_path": {model: model.replace("/", "--") for model in models_available},
             "caps_version": self._caps_version,
         }

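With this change, /v1/caps points clients at the webgui itself rather than at huggingface.co. A minimal sketch of the client-side resolution this implies, assuming clients substitute $MODEL with the rewrite-path entry exactly as they did with the old Hugging Face template:

```python
# Sketch of the client-side lookup implied by the new caps fields above.
caps = {
    "tokenizer_path_template": "/tokenizer/$MODEL",
    "tokenizer_rewrite_path": {"llama3/8b/instruct": "llama3--8b--instruct"},
}

def tokenizer_url(model: str) -> str:
    # Fall back to the raw model name if no rewrite entry exists.
    rewritten = caps["tokenizer_rewrite_path"].get(model, model)
    return caps["tokenizer_path_template"].replace("$MODEL", rewritten)

assert tokenizer_url("llama3/8b/instruct") == "/tokenizer/llama3--8b--instruct"
```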
@@ -293,6 +295,44 @@ def _select_default_lora_if_exists(model_name: str, running_models: List[str]):

         return Response(content=json.dumps(data, indent=4), media_type="application/json")

+    async def _local_tokenizer(self, model_path: str) -> str:
+        model_dir = Path(env.DIR_WEIGHTS) / f"models--{model_path.replace('/', '--')}"
+        tokenizer_paths = list(model_dir.rglob("tokenizer.json"))
+        if not tokenizer_paths:
+            raise HTTPException(404, detail=f"tokenizer.json for {model_path} does not exist")
+        if len(tokenizer_paths) > 1:
+            raise HTTPException(404, detail=f"multiple tokenizer.json for {model_path}")
+
+        data = ""
+        async with aiofiles.open(tokenizer_paths[0], mode='r') as f:
+            while True:
+                if not (chunk := await f.read(1024 * 1024)):
+                    break
+                data += chunk
+
+        return data
+
+    async def _passthrough_tokenizer(self, model_path: str) -> str:
+        try:
+            async with aiohttp.ClientSession() as session:
+                tokenizer_url = f"https://huggingface.co/{model_path}/resolve/main/tokenizer.json"
+                async with session.get(tokenizer_url) as resp:
+                    return await resp.text()
+        except aiohttp.ClientError:
+            raise HTTPException(404, detail=f"can't load tokenizer.json for passthrough {model_path}")
+
+    async def _tokenizer(self, model_name: str):
+        model_name = model_name.replace("--", "/")
+        if model_name in self._model_assigner.models_db:
+            model_path = self._model_assigner.models_db[model_name]["model_path"]
+            data = await self._local_tokenizer(model_path)
+        elif model_name in self._model_assigner.passthrough_mini_db:
+            model_path = self._model_assigner.passthrough_mini_db[model_name]["tokenizer_path"]
+            data = await self._passthrough_tokenizer(model_path)
+        else:
+            raise HTTPException(404, detail=f"model '{model_name}' does not exist in db")
+        return Response(content=data, media_type='application/json')
+
     async def _login(self, authorization: str = Header(None)) -> Dict:
         account = await self._account_from_bearer(authorization)
         return {
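Taken together, the new route serves tokenizer.json either from the local weights cache (for models in models_db) or by proxying huggingface.co (for passthrough models). A hedged usage sketch; the host and port are assumptions, not part of the PR:

```python
# Hypothetical client call to the new endpoint; "--" stands in for "/" in the
# model name, matching the rewrite in _caps_base_data.
import requests

resp = requests.get("http://127.0.0.1:8008/tokenizer/llama3--8b--instruct")
resp.raise_for_status()
tokenizer_json = resp.json()  # parsed contents of tokenizer.json
```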
refact_webgui/webgui/selfhost_model_resolve.py (0 additions, 8 deletions)

@@ -44,11 +44,3 @@ def resolve_model_context_size(model_name: str, model_assigner: ModelAssigner) -
     if model_name in model_assigner.passthrough_mini_db:
         if max_tokens := model_assigner.passthrough_mini_db[model_name].get('T'):
             return min(PASSTHROUGH_MAX_TOKENS_LIMIT, max_tokens)
-
-
-def resolve_tokenizer_name_for_model(model_name: str, model_assigner: ModelAssigner) -> Optional[str]:
-    if model_name in model_assigner.models_db:
-        return model_assigner.models_db[model_name].get('model_path')
-
-    if model_name in model_assigner.passthrough_mini_db:
-        return model_assigner.passthrough_mini_db[model_name].get('tokenizer_path')