diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7cad1a3583..0d716ae660 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
files: xinference
repos:
- repo: https://github.com/psf/black
- rev: 23.1.0
+ rev: 24.1a1
hooks:
- id: black
- repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py
index d0a8437ef3..d184ff512d 100644
--- a/xinference/api/restful_api.py
+++ b/xinference/api/restful_api.py
@@ -258,9 +258,6 @@ def serve(self, logging_conf: Optional[dict] = None):
f"{pprint.pformat(invalid_routes)}"
)
- for tp in [CreateChatCompletion, CreateCompletion]:
- logger.debug("Dump request model fields:\n%s", tp.__fields__)
-
class SPAStaticFiles(StaticFiles):
async def get_response(self, path: str, scope):
response = await super().get_response(path, scope)
@@ -288,13 +285,11 @@ def read_main():
SPAStaticFiles(directory=ui_location, html=True),
)
else:
- warnings.warn(
- f"""
+ warnings.warn(f"""
Xinference ui is not built at expected directory: {ui_location}
To resolve this warning, navigate to {os.path.join(lib_location, "web/ui/")}
And build the Xinference ui by running "npm run build"
- """
- )
+ """)
config = Config(
app=self._app, host=self._host, port=self._port, log_config=logging_conf
diff --git a/xinference/core/chat_interface.py b/xinference/core/chat_interface.py
index aa5b284a72..2f4d5f40aa 100644
--- a/xinference/core/chat_interface.py
+++ b/xinference/core/chat_interface.py
@@ -282,13 +282,10 @@ def retry(text, hist, max_tokens, temperature) -> Generator:
) as generate_interface:
history = gr.State([])
- Markdown(
- f"""
+ Markdown(f"""
🚀 Xinference Generate Bot : {self.model_name} 🚀
- """
- )
- Markdown(
- f"""
+ """)
+ Markdown(f"""
Model ID: {self.model_uid}
@@ -301,8 +298,7 @@ def retry(text, hist, max_tokens, temperature) -> Generator:
Model Quantization: {self.quantization}
- """
- )
+ """)
with Column(variant="panel"):
textbox = Textbox(
diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py
index 5a887b72e6..2cfb46a5cb 100644
--- a/xinference/core/supervisor.py
+++ b/xinference/core/supervisor.py
@@ -63,9 +63,9 @@ def __init__(self):
super().__init__()
self._worker_address_to_worker: Dict[str, xo.ActorRefType["WorkerActor"]] = {}
self._worker_status: Dict[str, WorkerStatus] = {}
- self._replica_model_uid_to_worker: Dict[
- str, xo.ActorRefType["WorkerActor"]
- ] = {}
+ self._replica_model_uid_to_worker: Dict[str, xo.ActorRefType["WorkerActor"]] = (
+ {}
+ )
self._model_uid_to_replica_info: Dict[str, ReplicaInfo] = {}
self._uptime = None
self._lock = asyncio.Lock()
diff --git a/xinference/deploy/utils.py b/xinference/deploy/utils.py
index 953c3030aa..d3d3343b13 100644
--- a/xinference/deploy/utils.py
+++ b/xinference/deploy/utils.py
@@ -60,7 +60,9 @@ def get_config_dict(
"disable_existing_loggers": False,
"formatters": {
"formatter": {
- "format": "%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s"
+ "format": (
+ "%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s"
+ )
},
},
"filters": {
diff --git a/xinference/model/embedding/__init__.py b/xinference/model/embedding/__init__.py
index fdbf7f90ae..0b538806ab 100644
--- a/xinference/model/embedding/__init__.py
+++ b/xinference/model/embedding/__init__.py
@@ -16,7 +16,7 @@
import json
import os
-from .core import EmbeddingModelSpec, get_cache_status
+from .core import MODEL_NAME_TO_REVISION, EmbeddingModelSpec, get_cache_status
from .custom import CustomEmbeddingModelSpec, register_embedding, unregister_embedding
_model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
@@ -27,12 +27,16 @@
(spec["model_name"], EmbeddingModelSpec(**spec))
for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
)
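+# Record builtin revisions so cache checks match models from either hub.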
+for model_name, model_spec in BUILTIN_EMBEDDING_MODELS.items():
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
MODELSCOPE_EMBEDDING_MODELS = dict(
(spec["model_name"], EmbeddingModelSpec(**spec))
for spec in json.load(
codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
)
)
+for model_name, model_spec in MODELSCOPE_EMBEDDING_MODELS.items():
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
from ...constants import XINFERENCE_MODEL_DIR
diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
index 2cd16deb31..a97f02960a 100644
--- a/xinference/model/embedding/core.py
+++ b/xinference/model/embedding/core.py
@@ -15,7 +15,8 @@
import logging
import os
import shutil
-from typing import List, Optional, Tuple, Union, no_type_check
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple, Union, no_type_check
import numpy as np
from pydantic import BaseModel
@@ -23,11 +24,14 @@
from ...constants import XINFERENCE_CACHE_DIR
from ...types import Embedding, EmbeddingData, EmbeddingUsage
from ..core import ModelDescription
-from ..utils import valid_model_revision
+from ..utils import is_model_cached, valid_model_revision
logger = logging.getLogger(__name__)
SUPPORTED_SCHEMES = ["s3"]
+# Used to check whether a model is cached.
+# Initialized when registering all the builtin models.
+MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
class EmbeddingModelSpec(BaseModel):
@@ -195,11 +199,7 @@ def cache(model_spec: EmbeddingModelSpec):
def get_cache_status(
model_spec: EmbeddingModelSpec,
) -> bool:
- cache_dir = os.path.realpath(
- os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
- )
- meta_path = os.path.join(cache_dir, "__valid_download")
- return valid_model_revision(meta_path, model_spec.model_revision)
+ return is_model_cached(model_spec, MODEL_NAME_TO_REVISION)
class EmbeddingModel:
diff --git a/xinference/model/llm/ggml/chatglm.py b/xinference/model/llm/ggml/chatglm.py
index 4a2f637dbc..86ef20fe45 100644
--- a/xinference/model/llm/ggml/chatglm.py
+++ b/xinference/model/llm/ggml/chatglm.py
@@ -134,9 +134,9 @@ def _convert_raw_text_chunks_to_chat(
{
"index": 0,
"delta": {
- "content": token
- if isinstance(token, str)
- else token.content,
+ "content": (
+ token if isinstance(token, str) else token.content
+ ),
},
"finish_reason": None,
}
@@ -223,8 +223,10 @@ def _handle_tools(generate_config) -> Optional[ChatCompletionMessage]:
chatglm_tools.append(elem["function"])
return {
"role": "system",
- "content": f"Answer the following questions as best as you can. You have access to the following tools:\n"
- f"{json.dumps(chatglm_tools, indent=4, ensure_ascii=False)}",
+ "content": (
+ f"Answer the following questions as best as you can. You have access to the following tools:\n"
+ f"{json.dumps(chatglm_tools, indent=4, ensure_ascii=False)}"
+ ),
}
def chat(
diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py
index 8c019ceeeb..4ec72f4a74 100644
--- a/xinference/model/llm/llm_family.py
+++ b/xinference/model/llm/llm_family.py
@@ -588,31 +588,57 @@ def cache_from_huggingface(
return cache_dir
+def _check_revision(
+ llm_family: LLMFamilyV1,
+ llm_spec: "LLMSpecV1",
+ builtin: list,
+ meta_path: str,
+) -> bool:
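+    # Find the builtin family with the same model name and validate the cached
+    # revision against its pytorch spec of the matching size.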
+ for family in builtin:
+ if llm_family.model_name == family.model_name:
+ specs = family.model_specs
+ for spec in specs:
+ if (
+ spec.model_format == "pytorch"
+ and spec.model_size_in_billions == llm_spec.model_size_in_billions
+ ):
+ return valid_model_revision(meta_path, spec.model_revision)
+ return False
+
+
def get_cache_status(
llm_family: LLMFamilyV1,
llm_spec: "LLMSpecV1",
) -> Union[bool, List[bool]]:
+ """
+    Callers construct `llm_family` only from BUILTIN_LLM_FAMILIES, so we need to
+    check the cache files for both huggingface and modelscope.
+ """
cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
+    # Check the cached revision for pytorch models.
if llm_spec.model_format == "pytorch":
- return _skip_download(
- cache_dir,
- llm_spec.model_format,
- llm_spec.model_hub,
- llm_spec.model_revision,
- "none",
- )
+ hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
+ ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
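+        # The model may have been downloaded from either hub, so the cache counts
+        # as valid if either meta file matches its hub's builtin revision.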
+ revisions = [
+ _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
+ _check_revision(
+ llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
+ ),
+ ]
+ return any(revisions)
+    # For ggml, gguf, and gptq models, just check whether the meta files exist.
elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq"]:
ret = []
for q in llm_spec.quantizations:
- ret.append(
- _skip_download(
- cache_dir,
- llm_spec.model_format,
- llm_spec.model_hub,
- llm_spec.model_revision,
- q,
- )
+ assert q is not None
+ hf_meta_path = _get_meta_path(
+ cache_dir, llm_spec.model_format, "huggingface", q
+ )
+ ms_meta_path = _get_meta_path(
+ cache_dir, llm_spec.model_format, "modelscope", q
)
+ results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
+ ret.append(any(results))
return ret
else:
raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
diff --git a/xinference/model/llm/pytorch/core.py b/xinference/model/llm/pytorch/core.py
index 95e2bac596..17957fcda2 100644
--- a/xinference/model/llm/pytorch/core.py
+++ b/xinference/model/llm/pytorch/core.py
@@ -442,9 +442,9 @@ def _sanitize_generate_config(
and self.model_family.prompt_style
and self.model_family.prompt_style.stop_token_ids
):
- generate_config[
- "stop_token_ids"
- ] = self.model_family.prompt_style.stop_token_ids.copy()
+ generate_config["stop_token_ids"] = (
+ self.model_family.prompt_style.stop_token_ids.copy()
+ )
return generate_config
diff --git a/xinference/model/llm/tests/test_llm_family.py b/xinference/model/llm/tests/test_llm_family.py
index 7b5036e367..578b5996f1 100644
--- a/xinference/model/llm/tests/test_llm_family.py
+++ b/xinference/model/llm/tests/test_llm_family.py
@@ -289,6 +289,7 @@ def test_meta_file():
cache_dir = cache_from_huggingface(family, spec, quantization=None)
meta_path = _get_meta_path(cache_dir, spec.model_format, spec.model_hub, None)
assert valid_model_revision(meta_path, "3d2b5f275bdf882b8775f902e1bfdb790e2cfc32")
+ shutil.rmtree(cache_dir)
def test_parse_uri():
@@ -878,6 +879,7 @@ def test_get_cache_status_pytorch():
model_size_in_billions=1,
quantizations=["4-bit", "8-bit", "none"],
model_id="facebook/opt-125m",
+ model_revision="3d2b5f275bdf882b8775f902e1bfdb790e2cfc32",
)
family = LLMFamilyV1(
version=1,
diff --git a/xinference/model/rerank/__init__.py b/xinference/model/rerank/__init__.py
index d2ecd4c09c..581e980fe0 100644
--- a/xinference/model/rerank/__init__.py
+++ b/xinference/model/rerank/__init__.py
@@ -16,7 +16,7 @@
import json
import os
-from .core import RerankModelSpec, get_cache_status
+from .core import MODEL_NAME_TO_REVISION, RerankModelSpec, get_cache_status
_model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
_model_spec_modelscope_json = os.path.join(
@@ -26,11 +26,15 @@
(spec["model_name"], RerankModelSpec(**spec))
for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
)
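+# Record builtin revisions so cache checks match models from either hub.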
+for model_name, model_spec in BUILTIN_RERANK_MODELS.items():
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
MODELSCOPE_RERANK_MODELS = dict(
(spec["model_name"], RerankModelSpec(**spec))
for spec in json.load(
codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
)
)
+for model_name, model_spec in MODELSCOPE_RERANK_MODELS.items():
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
del _model_spec_json
del _model_spec_modelscope_json
diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py
index b51cabc34a..7e324d3ef3 100644
--- a/xinference/model/rerank/core.py
+++ b/xinference/model/rerank/core.py
@@ -15,6 +15,7 @@
import logging
import os
import uuid
+from collections import defaultdict
from typing import Dict, List, Optional, Tuple
import numpy as np
@@ -23,10 +24,14 @@
from ...constants import XINFERENCE_CACHE_DIR
from ...types import Document, DocumentObj, Rerank
from ..core import ModelDescription
-from ..utils import valid_model_revision
+from ..utils import is_model_cached, valid_model_revision
logger = logging.getLogger(__name__)
+# Used to check whether a model is cached.
+# Initialized when registering all the builtin models.
+MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
+
class RerankModelSpec(BaseModel):
model_name: str
@@ -126,11 +131,7 @@ def rerank(
def get_cache_status(
model_spec: RerankModelSpec,
) -> bool:
- cache_dir = os.path.realpath(
- os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
- )
- meta_path = os.path.join(cache_dir, "__valid_download")
- return valid_model_revision(meta_path, model_spec.model_revision)
+ return is_model_cached(model_spec, MODEL_NAME_TO_REVISION)
def cache(model_spec: RerankModelSpec):
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index f1da3e75f0..e1ee21d117 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -16,11 +16,11 @@
import os
from json import JSONDecodeError
from pathlib import Path
-from typing import Callable, Dict, Optional, Tuple
+from typing import Any, Callable, Dict, Optional, Tuple
from fsspec import AbstractFileSystem
-from ..constants import XINFERENCE_ENV_MODEL_SRC
+from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
logger = logging.getLogger(__name__)
MAX_ATTEMPTS = 3
@@ -132,6 +132,17 @@ def valid_model_revision(
return real_revision == expected_model_revision
+def is_model_cached(model_spec: Any, name_to_revisions_mapping: Dict) -> bool:
+ cache_dir = os.path.realpath(
+ os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
+ )
+ meta_path = os.path.join(cache_dir, "__valid_download")
+ revisions = name_to_revisions_mapping[model_spec.model_name]
+    if model_spec.model_revision not in revisions:  # Usually happens in unit tests
+ revisions.append(model_spec.model_revision)
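+    # The cache is valid if the meta file matches any known revision of this model.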
+    return any(valid_model_revision(meta_path, revision) for revision in revisions)
+
+
def is_valid_model_name(model_name: str) -> bool:
import re
diff --git a/xinference/web/ui/src/scenes/launch_model/embeddingCard.js b/xinference/web/ui/src/scenes/launch_model/embeddingCard.js
index a2e272f118..03db9ac388 100644
--- a/xinference/web/ui/src/scenes/launch_model/embeddingCard.js
+++ b/xinference/web/ui/src/scenes/launch_model/embeddingCard.js
@@ -266,6 +266,18 @@ const EmbeddingCard = ({
return
}
})()}
+ {(() => {
+ if (modelData.is_cached) {
+ return (
+
+ )
+ }
+ })()}
{(() => {
if (is_custom && customDeleted) {
return (
diff --git a/xinference/web/ui/src/scenes/launch_model/index.js b/xinference/web/ui/src/scenes/launch_model/index.js
index 3d50328704..33aceee695 100644
--- a/xinference/web/ui/src/scenes/launch_model/index.js
+++ b/xinference/web/ui/src/scenes/launch_model/index.js
@@ -1,7 +1,8 @@
import { TabContext, TabList, TabPanel } from '@mui/lab'
import { Box, Tab } from '@mui/material'
-import React from 'react'
+import React, { useContext, useEffect, useState } from 'react'
+import { ApiContext } from '../../components/apiContext'
import ErrorMessageSnackBar from '../../components/errorMessageSnackBar'
import Title from '../../components/Title'
import LaunchCustom from './launchCustom'
@@ -10,12 +11,42 @@ import LaunchLLM from './launchLLM'
import LaunchRerank from './launchRerank'
const LaunchModel = () => {
+ let endPoint = useContext(ApiContext).endPoint
const [value, setValue] = React.useState('1')
+ const [gpuAvailable, setGPUAvailable] = useState(-1)
+
+ const { setErrorMsg } = useContext(ApiContext)
const handleTabChange = (event, newValue) => {
setValue(newValue)
}
+ useEffect(() => {
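+    // Fetch the number of available GPUs once on mount; -1 means not fetched yet.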
+ if (gpuAvailable === -1) {
+ fetch(endPoint + '/v1/cluster/devices', {
+ method: 'GET',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ }).then((res) => {
+ if (!res.ok) {
+          // Errors here usually indicate that the cluster is unavailable.
+ res.json().then((errorData) => {
+ setErrorMsg(
+ `Server error: ${res.status} - ${
+ errorData.detail || 'Unknown error'
+ }`
+ )
+ })
+ } else {
+ res.json().then((data) => {
+ setGPUAvailable(parseInt(data, 10))
+ })
+ }
+ })
+ }
+ }, [])
+
return (
@@ -30,7 +61,7 @@ const LaunchModel = () => {
-
+
@@ -39,7 +70,7 @@ const LaunchModel = () => {
-
+
diff --git a/xinference/web/ui/src/scenes/launch_model/launchCustom.js b/xinference/web/ui/src/scenes/launch_model/launchCustom.js
index d8f67686b3..deb50776d7 100644
--- a/xinference/web/ui/src/scenes/launch_model/launchCustom.js
+++ b/xinference/web/ui/src/scenes/launch_model/launchCustom.js
@@ -5,7 +5,7 @@ import { ApiContext } from '../../components/apiContext'
import EmbeddingCard from './embeddingCard'
import ModelCard from './modelCard'
-const LaunchCustom = () => {
+const LaunchCustom = ({ gpuAvailable }) => {
let endPoint = useContext(ApiContext).endPoint
const [registrationData, setRegistrationData] = useState([])
const { isCallingApi, setIsCallingApi } = useContext(ApiContext)
@@ -138,7 +138,7 @@ const LaunchCustom = () => {
)
@@ -147,6 +147,7 @@ const LaunchCustom = () => {
)
diff --git a/xinference/web/ui/src/scenes/launch_model/launchEmbedding.js b/xinference/web/ui/src/scenes/launch_model/launchEmbedding.js
index 771b5d1c65..9d54c95ebe 100644
--- a/xinference/web/ui/src/scenes/launch_model/launchEmbedding.js
+++ b/xinference/web/ui/src/scenes/launch_model/launchEmbedding.js
@@ -22,10 +22,7 @@ const LaunchEmbedding = () => {
const modelName = registration.model_name
? registration.model_name.toLowerCase()
: ''
- if (!modelName.includes(searchTerm.toLowerCase())) {
- return false
- }
- return true
+ return modelName.includes(searchTerm.toLowerCase())
}
const update = async () => {
@@ -35,7 +32,7 @@ const LaunchEmbedding = () => {
setIsCallingApi(true)
const response = await fetch(
- `${endPoint}/v1/model_registrations/embedding`,
+ `${endPoint}/v1/model_registrations/embedding?detailed=true`,
{
method: 'GET',
}
@@ -54,6 +51,7 @@ const LaunchEmbedding = () => {
return {
...(await desc.json()),
is_builtin: registration.is_builtin,
+ is_cached: registration.cache_status,
}
})
)
diff --git a/xinference/web/ui/src/scenes/launch_model/launchLLM.js b/xinference/web/ui/src/scenes/launch_model/launchLLM.js
index 78c8fcc70c..467f7d1624 100644
--- a/xinference/web/ui/src/scenes/launch_model/launchLLM.js
+++ b/xinference/web/ui/src/scenes/launch_model/launchLLM.js
@@ -11,18 +11,16 @@ import React, { useContext, useEffect, useState } from 'react'
import { ApiContext } from '../../components/apiContext'
import ModelCard from './modelCard'
-const LaunchLLM = () => {
+const LaunchLLM = ({ gpuAvailable }) => {
let endPoint = useContext(ApiContext).endPoint
const [registrationData, setRegistrationData] = useState([])
const { isCallingApi, setIsCallingApi } = useContext(ApiContext)
const { isUpdatingModel } = useContext(ApiContext)
- const { setErrorMsg } = useContext(ApiContext)
// States used for filtering
const [searchTerm, setSearchTerm] = useState('')
const [modelAbility, setModelAbility] = useState('all')
- const [gpuAvailable, setGPUAvailable] = useState(-1)
const handleChange = (event) => {
setSearchTerm(event.target.value)
@@ -83,32 +81,6 @@ const LaunchLLM = () => {
update()
}, [])
- useEffect(() => {
- if (gpuAvailable === -1) {
- fetch(endPoint + '/v1/cluster/devices', {
- method: 'GET',
- headers: {
- 'Content-Type': 'application/json',
- },
- }).then((res) => {
- if (!res.ok) {
- // Usually, if some errors happen here, check if the cluster is available
- res.json().then((errorData) => {
- setErrorMsg(
- `Server error: ${res.status} - ${
- errorData.detail || 'Unknown error'
- }`
- )
- })
- } else {
- res.json().then((data) => {
- setGPUAvailable(parseInt(data, 10))
- })
- }
- })
- }
- }, [])
-
const style = {
display: 'grid',
gridTemplateColumns: 'repeat(auto-fill, minmax(300px, 1fr))',
diff --git a/xinference/web/ui/src/scenes/launch_model/launchRerank.js b/xinference/web/ui/src/scenes/launch_model/launchRerank.js
index 51c9285b8a..07bed5a8cd 100644
--- a/xinference/web/ui/src/scenes/launch_model/launchRerank.js
+++ b/xinference/web/ui/src/scenes/launch_model/launchRerank.js
@@ -32,7 +32,7 @@ const LaunchRerank = () => {
setIsCallingApi(true)
const response = await fetch(
- `${endPoint}/v1/model_registrations/rerank`,
+ `${endPoint}/v1/model_registrations/rerank?detailed=true`,
{
method: 'GET',
}
@@ -51,6 +51,7 @@ const LaunchRerank = () => {
return {
...(await desc.json()),
is_builtin: registration.is_builtin,
+ is_cached: registration.cache_status,
}
})
)
diff --git a/xinference/web/ui/src/scenes/launch_model/rerankCard.js b/xinference/web/ui/src/scenes/launch_model/rerankCard.js
index 0bfd400db6..92bdecb1bc 100644
--- a/xinference/web/ui/src/scenes/launch_model/rerankCard.js
+++ b/xinference/web/ui/src/scenes/launch_model/rerankCard.js
@@ -214,6 +214,18 @@ const RerankCard = ({ url, modelData }) => {
return
}
})()}
+ {(() => {
+ if (modelData.is_cached) {
+ return (
+
+ )
+ }
+ })()}
{hover ? (
diff --git a/xinference/web/ui/src/scenes/register_model/index.js b/xinference/web/ui/src/scenes/register_model/index.js
index 2deeb7fc04..ab9a6b1161 100644
--- a/xinference/web/ui/src/scenes/register_model/index.js
+++ b/xinference/web/ui/src/scenes/register_model/index.js
@@ -161,6 +161,8 @@ const RegisterModel = () => {
roles: ps.roles,
intra_message_sep: ps.intra_message_sep,
inter_message_sep: ps.inter_message_sep,
+ stop: ps.stop ?? null,
+ stop_token_ids: ps.stop_token_ids ?? null,
}
}
}
diff --git a/xinference/web/ui/src/scenes/running_models/index.js b/xinference/web/ui/src/scenes/running_models/index.js
index 9c2afb4fb5..e87bd57aea 100644
--- a/xinference/web/ui/src/scenes/running_models/index.js
+++ b/xinference/web/ui/src/scenes/running_models/index.js
@@ -405,7 +405,14 @@ const RunningModels = () => {
}
return (
-
+