
Commit

fix rebase
RayJi01 committed Aug 4, 2023
1 parent 4d5c06b commit b8be3cd
Showing 4 changed files with 357 additions and 1 deletion.
2 changes: 2 additions & 0 deletions xinference/model/llm/__init__.py
@@ -108,6 +108,7 @@ def match_llm_cls(

def _install():
    from .ggml.chatglm import ChatglmCppChatModel
    from .ggml.ctransformer import CtransformerModel
    from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
    from .pytorch.baichuan import BaichuanPytorchChatModel
    from .pytorch.chatglm import ChatglmPytorchChatModel
@@ -127,6 +128,7 @@ def _install():
            FalconPytorchModel,
            FalconPytorchChatModel,
            ChatglmPytorchChatModel,
            CtransformerModel,
        ]
    )

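The two hunks above register the new backend with the loader: `match_llm_cls` (whose body is not shown in this diff) presumably walks the registered classes and returns the first one whose `match()` classmethod accepts the requested family and spec. A minimal sketch of that selection loop, assuming a registry named `LLM_CLASSES`; the registry name and the loop details are assumptions, only the class list extension and the `match()` contract come from this commit:

from typing import List, Optional, Type

# Hypothetical registry and matcher sketch -- not the actual xinference code.
LLM_CLASSES: List[Type] = []


def match_llm_cls(llm_family, llm_spec) -> Optional[Type]:
    for cls in LLM_CLASSES:
        # Each backend decides whether it can serve the requested model spec,
        # e.g. CtransformerModel below only accepts ggmlv3 TheBloke/starcoder-GGML.
        if cls.match(llm_family, llm_spec):
            return cls
    return None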
184 changes: 184 additions & 0 deletions xinference/model/llm/ggml/ctransformer.py
@@ -0,0 +1,184 @@
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Iterator, Optional, Sequence, TypedDict, Union

from ctransformers import AutoConfig

from xinference.model.llm.ggml.ctransformers_util import generate_stream
from xinference.types import Completion, CompletionChunk

from ..core import LLM
from ..llm_family import LLMFamilyV1, LLMSpecV1
from .llamacpp import SIZE_TO_GPU_LAYERS

logger = logging.getLogger(__name__)


# class AutoConfig(TypedDict, total=False):
# top_k: int
# top_p: float
# temperature: float
# repetition_penalty: float
# last_n_tokens: float
# seed: int
# max_new_tokens: int
# stop: List[str]
# stream: bool
# reset: bool
# batch_size: int
# threads: int
# context_length: int
# gpu_layers: int


class CtransformerGenerateConfig(TypedDict, total=False):
    max_new_tokens: Optional[int]
    top_k: Optional[int]
    top_p: Optional[float]
    temperature: Optional[float]
    repetition_penalty: Optional[float]
    last_n_tokens: Optional[int]
    seed: Optional[int]
    batch_size: Optional[int]
    threads: Optional[int]
    stop: Optional[Sequence[str]]
    stream: Optional[bool]
    reset: Optional[bool]


class CtransformerModel(LLM):
    def __init__(
        self,
        model_uid: str,
        model_family: "LLMFamilyV1",
        model_spec: "LLMSpecV1",
        quantization: str,
        model_path: str,
        ctransformerModelConfig: Optional[AutoConfig] = None,
    ):
        super().__init__(model_uid, model_family, model_spec, quantization, model_path)

        closest_size = min(
            SIZE_TO_GPU_LAYERS.keys(),
            key=lambda x: abs(x - model_spec.model_size_in_billions),
        )
        self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
        self._ctransformer_model_config: AutoConfig = self._sanitize_model_config(
            model_path, ctransformerModelConfig
        )
        self._llm = None

    def _sanitize_model_config(
        self, model_path, ctransformerModelConfig: Optional[AutoConfig]
    ) -> AutoConfig:
        if ctransformerModelConfig is None:
            ctransformerModelConfig = AutoConfig.from_pretrained(
                model_path,
                local_files_only=False,
            )

        return ctransformerModelConfig

    def _sanitize_generate_config(
        self,
        ctransformerGenerateConfig: Optional[CtransformerGenerateConfig],
    ) -> CtransformerGenerateConfig:
        if ctransformerGenerateConfig is None:
            ctransformerGenerateConfig = CtransformerGenerateConfig()
        ctransformerGenerateConfig.setdefault("top_k", 40)
        ctransformerGenerateConfig.setdefault("top_p", 0.95)
        ctransformerGenerateConfig.setdefault("temperature", 0.8)
        ctransformerGenerateConfig.setdefault("repetition_penalty", 1.1)
        ctransformerGenerateConfig.setdefault("last_n_tokens", 64)
        ctransformerGenerateConfig.setdefault("seed", -1)
        ctransformerGenerateConfig.setdefault("batch_size", 8)
        ctransformerGenerateConfig.setdefault("threads", -1)
        ctransformerGenerateConfig.setdefault("stop", None)
        ctransformerGenerateConfig.setdefault("stream", None)
        ctransformerGenerateConfig.setdefault("reset", True)

        return ctransformerGenerateConfig

    def load(self):
        try:
            from ctransformers import AutoModelForCausalLM
        except ImportError:
            error_message = "Failed to import module 'ctransformers'"
            if self._is_darwin_and_apple_silicon():
                system = "Metal"
            else:
                system = "CUDA"

            installation_guide = [
                f"Please make sure 'ctransformers' is installed and the {system} accelerator is available. ",
                f"You can find the installation command for the {system} platform in the repository: "
                f"https://github.com/marella/ctransformers",
            ]

            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

        self._llm = AutoModelForCausalLM.from_pretrained(
            model_path_or_repo_id=self._model_path,
            model_type=self._model_type,
            model_file=self._model_file,
            config=self._ctransformer_model_config,
        )

    @classmethod
    def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool:
        if llm_spec.model_format != "ggmlv3":
            return False
        if llm_spec.model_id not in ["TheBloke/starcoder-GGML"]:
            return False
        if "chatglm" in llm_family.model_name:
            return False
        if "generate" not in llm_family.model_ability:
            return False
        return True

    def generate(
        self, prompt: str, generate_config: CtransformerGenerateConfig
    ) -> Union[Completion, Iterator[CompletionChunk]]:
        def generator_wrapper(
            _prompt: str,
            _generate_config: CtransformerGenerateConfig,
        ) -> Iterator[CompletionChunk]:
            assert self._llm is not None
            for _completion_chunk, _ in generate_stream(
                model=self._llm, prompt=_prompt, **_generate_config
            ):
                yield _completion_chunk

        generate_config = self._sanitize_generate_config(generate_config)

        stream_or_not = generate_config.get("stream", False)
        if stream_or_not:
            return generator_wrapper(_prompt=prompt, _generate_config=generate_config)
        else:
            # Drain the stream and keep only the final chunk and its usage counts.
            for completion_chunk, completion_usage in generate_stream(
                self._llm, prompt=prompt, **generate_config
            ):
                pass

            completion = Completion(
                id=completion_chunk["id"],
                object=completion_chunk["object"],
                created=completion_chunk["created"],
                model=completion_chunk["model"],
                choices=completion_chunk["choices"],
                usage=completion_usage,
            )
            return completion
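For orientation, here is a hedged sketch of how CtransformerModel might be driven end to end. The family/spec values mirror the starcoder entry added to llm_family.json below, but the exact LLMFamilyV1/LLMSpecV1 constructor signatures, the local file path, and the model uid are assumptions for illustration, not values taken from this commit.

from xinference.model.llm.ggml.ctransformer import CtransformerModel
from xinference.model.llm.llm_family import LLMFamilyV1, LLMSpecV1

# Assumed: the pydantic models accept the same field names used in llm_family.json.
spec = LLMSpecV1(
    model_format="ggmlv3",
    model_size_in_billions=16,
    quantizations=["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"],
    model_id="TheBloke/starcoder-GGML",
    model_file_name_template="starcoder.ggmlv3.{quantization}.bin",
)
family = LLMFamilyV1(
    version=1,
    model_name="starcoder",
    model_lang=["en"],
    model_ability=["generate"],
    model_specs=[spec],
    prompt_style=None,
)

model = CtransformerModel(
    model_uid="my-starcoder",                        # illustrative uid
    model_family=family,
    model_spec=spec,
    quantization="q4_0",
    model_path="/models/starcoder.ggmlv3.q4_0.bin",  # illustrative local path
)
model.load()

# Non-streaming: generate() drains the stream internally and returns one Completion.
completion = model.generate("def fibonacci(n):", {"max_new_tokens": 64})
print(completion["choices"][0]["text"])

# Streaming: with stream=True an iterator of CompletionChunk dicts is returned instead.
for chunk in model.generate("def fibonacci(n):", {"max_new_tokens": 64, "stream": True}):
    print(chunk["choices"][0]["text"], end="", flush=True)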
143 changes: 143 additions & 0 deletions xinference/model/llm/ggml/ctransformers_util.py
@@ -0,0 +1,143 @@
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import time
import uuid
from typing import Iterator, Optional, Sequence, Tuple

from ctransformers.utils import utf8_split_incomplete

from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage


def _get(*values):
    for value in values:
        if value is not None:
            return value


def generate_stream(
    model,
    prompt: str,
    *,
    max_new_tokens: Optional[int] = None,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    temperature: Optional[float] = None,
    repetition_penalty: Optional[float] = None,
    last_n_tokens: Optional[int] = None,
    seed: Optional[int] = None,
    batch_size: Optional[int] = None,
    stream: Optional[bool] = True,
    threads: Optional[int] = None,
    stop: Optional[Sequence[str]] = None,
    reset: Optional[bool] = None,
) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]:
    max_new_tokens = _get(max_new_tokens)
    stop = _get(stop) or []
    if isinstance(stop, str):
        stop = [stop]

    tokens = model.tokenize(prompt)

    stop_regex = re.compile("|".join(map(re.escape, stop)))
    count = 0
    text = ""
    incomplete = b""

    # parameters needed for Xinference.
    finish_reason = None

    for token in model.generate(
        tokens,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        last_n_tokens=last_n_tokens,
        seed=seed,
        batch_size=batch_size,
        threads=threads,
        reset=reset,
    ):
        # Handle incomplete UTF-8 multi-byte characters.
        incomplete += model.detokenize([token], decode=False)
        complete, incomplete = utf8_split_incomplete(incomplete)
        output = complete.decode(errors="ignore")
        text += output

        # https://github.com/abetlen/llama-cpp-python/blob/1a13d76c487df1c8560132d10bda62d6e2f4fa93/llama_cpp/llama.py#L686-L706
        # Check if one of the stop sequences is part of the text.
        # Note that the stop sequence may not always be at the end of text.
        if stop:
            match = stop_regex.search(text)
            if match:
                text = text[: match.start()]
                finish_reason = "stop"
                break

        # Avoid sending the longest suffix of text which is also a prefix
        # of a stop sequence, as it can form a stop sequence with the text
        # generated later.
        longest = 0
        for s in stop:
            for i in range(len(s), 0, -1):
                if text.endswith(s[:i]):
                    longest = max(i, longest)
                    break

        end = len(text) - longest
        if end > 0:
            output = text[:end]
            completion_choice = CompletionChoice(
                text=output, index=0, logprobs=None, finish_reason=None
            )
            completion_chunk = CompletionChunk(
                id=str(uuid.uuid1()),
                object="text_completion",
                created=int(time.time()),
                model=model,
                choices=[completion_choice],
            )
            completion_usage = CompletionUsage(
                prompt_tokens=len(tokens),
                completion_tokens=count + 1,
                total_tokens=count + 1 + len(tokens),
            )

            yield completion_chunk, completion_usage
            text = text[end:]

        count += 1
        if max_new_tokens is not None and count >= max_new_tokens:
            finish_reason = "length"
            break

    completion_choice = CompletionChoice(
        text=text, index=0, logprobs=None, finish_reason=finish_reason
    )
    completion_chunk = CompletionChunk(
        id=str(uuid.uuid1()),
        object="text_completion",
        created=int(time.time()),
        model=model,
        choices=[completion_choice],
    )
    completion_usage = CompletionUsage(
        prompt_tokens=len(tokens),
        completion_tokens=count,
        total_tokens=count + len(tokens),
    )

    yield completion_chunk, completion_usage
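The helper above can also be exercised directly against a raw ctransformers model, which is handy when checking stop-sequence handling. A small sketch, assuming a compatible GGML file is already on disk; the file path and the model_type value are illustrative assumptions:

from ctransformers import AutoModelForCausalLM

from xinference.model.llm.ggml.ctransformers_util import generate_stream

# Illustrative local file; the model_type for StarCoder-style GGML files is assumed here.
llm = AutoModelForCausalLM.from_pretrained(
    "/models/starcoder.ggmlv3.q4_0.bin",
    model_type="gpt_bigcode",
)

for chunk, usage in generate_stream(
    llm,
    "def quicksort(arr):",
    max_new_tokens=32,
    stop=["\n\n"],
):
    # Each iteration yields one CompletionChunk plus the running token usage.
    print(chunk["choices"][0]["text"], end="", flush=True)
print()
print(usage)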
29 changes: 28 additions & 1 deletion xinference/model/llm/llm_family.json
@@ -744,7 +744,8 @@
"version": 1,
"model_name": "qwen-chat",
"model_lang": [
"en", "zh"
"en",
"zh"
],
"model_ability": [
"embed",
@@ -774,5 +775,31 @@
                151643
            ]
        }
    },
    {
        "version": 1,
        "model_name": "starcoder",
        "model_lang": [
            "en"
        ],
        "model_ability": [
            "generate"
        ],
        "model_specs": [
            {
                "model_format": "ggmlv3",
                "model_size_in_billions": 16,
                "quantizations": [
                    "q4_0",
                    "q4_1",
                    "q5_0",
                    "q5_1",
                    "q8_0"
                ],
                "model_id": "TheBloke/starcoder-GGML",
                "model_file_name_template": "starcoder.ggmlv3.{quantization}.bin"
            }
        ],
        "prompt_style": null
    }
]
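For reference, model_file_name_template is what maps a chosen quantization to the concrete GGML file name; a plain str.format substitution illustrates the idea (how the framework actually expands the template is an assumption):

template = "starcoder.ggmlv3.{quantization}.bin"
print(template.format(quantization="q4_0"))  # starcoder.ggmlv3.q4_0.bin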
