Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

llama : add Deepseek support #5981 #6252

Closed
wants to merge 12 commits into from
17 changes: 17 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,20 @@ examples/jeopardy/results.txt
poetry.lock
poetry.toml
nppBackup

# Test binaries
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-double-float
/tests/test-grad0
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-sampling
/tests/test-tokenizer-0-llama
/tests/test-tokenizer-0-falcon
/tests/test-tokenizer-0-deepseek-coder
/tests/test-tokenizer-1-llama
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops
13 changes: 12 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ BUILD_TARGETS = \

# Binaries only useful for tests
TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-llama-grammar tests/test-tokenizer-0-deepseek-coder tests/test-tokenizer-0-deepseek-llm \
tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
Expand Down Expand Up @@ -53,6 +54,10 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-llm" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
Expand Down Expand Up @@ -962,6 +967,12 @@ tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

tests/test-tokenizer-0-deepseek-llm: tests/test-tokenizer-0-deepseek-llm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
Expand Down
149 changes: 147 additions & 2 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,78 @@ def from_model_architecture(cls, arch):
return cls._model_classes[arch]
except KeyError:
raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

@staticmethod
def from_model_architecture(model_architecture):
if model_architecture == "GPTNeoXForCausalLM":
return GPTNeoXModel
if model_architecture == "BloomForCausalLM":
return BloomModel
if model_architecture == "MPTForCausalLM":
return MPTModel
if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
return BaichuanModel
if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
return FalconModel
if model_architecture == "GPTBigCodeForCausalLM":
return StarCoderModel
if model_architecture == "GPTRefactForCausalLM":
return RefactModel
if model_architecture == "PersimmonForCausalLM":
return PersimmonModel
if model_architecture == "LlamaForCausalLM":
return DeepseekCoderModel
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return StableLMModel
if model_architecture == "QWenLMHeadModel":
return QwenModel
if model_architecture == "Qwen2ForCausalLM":
return Model
if model_architecture == "MixtralForCausalLM":
return MixtralModel
if model_architecture == "GPT2LMHeadModel":
return GPT2Model
if model_architecture == "PhiForCausalLM":
return Phi2Model
if model_architecture == "PlamoForCausalLM":
return PlamoModel
if model_architecture == "CodeShellForCausalLM":
return CodeShellModel
if model_architecture == "OrionForCausalLM":
return OrionModel
if model_architecture == "InternLM2ForCausalLM":
return InternLM2Model
if model_architecture == "MiniCPMForCausalLM":
return MiniCPMModel
if model_architecture == "BertModel":
return BertModel

@staticmethod
def from_model_name(model_name: str):
model_name_lower = model_name.lower()
if model_name_lower in ("stablelmepoch", "llavastablelmepoch"):
return StableLMModel
if model_name_lower == "gptneox":
return GPTNeoXModel
if model_name_lower == "bloom":
return BloomModel
if model_name_lower == "mpt":
return MPTModel
if model_name_lower in ("baichuan"):
return BaichuanModel
if model_name_lower in ("falcon", "rw"):
return FalconModel
if model_name_lower == "gptbigcode":
return StarCoderModel
if model_name_lower == "gptrefact":
return RefactModel
if model_name_lower == "persimmon":
return PersimmonModel
if model_name_lower == "deepseekcoder":
return DeepseekCoderModel
if model_name_lower == "deepseekllm":
return DeepseekLLMModel
return Model

def _is_model_safetensors(self) -> bool:
return Model.count_model_parts(self.dir_model, ".safetensors") > 0
Expand All @@ -226,6 +298,53 @@ def _get_part_names(self):
if self.num_parts == 1: # there's only one .bin file
return ("pytorch_model.bin",)
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

def _get_model_architecture(self) -> gguf.MODEL_ARCH:
arch = self.hparams["architectures"][0]
if arch == "GPTNeoXForCausalLM":
return gguf.MODEL_ARCH.GPTNEOX
if arch == "BloomForCausalLM":
return gguf.MODEL_ARCH.BLOOM
if arch == "MPTForCausalLM":
return gguf.MODEL_ARCH.MPT
if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
return gguf.MODEL_ARCH.BAICHUAN
if arch in ("FalconForCausalLM", "RWForCausalLM"):
return gguf.MODEL_ARCH.FALCON
if arch == "GPTBigCodeForCausalLM":
return gguf.MODEL_ARCH.STARCODER
if arch == "GPTRefactForCausalLM":
return gguf.MODEL_ARCH.REFACT
if arch == "PersimmonForCausalLM":
return gguf.MODEL_ARCH.PERSIMMON
if arch == "LlamaForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN
if arch == "Qwen2ForCausalLM":
return gguf.MODEL_ARCH.QWEN2
if arch == "MixtralForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch == "GPT2LMHeadModel":
return gguf.MODEL_ARCH.GPT2
if arch == "PhiForCausalLM":
return gguf.MODEL_ARCH.PHI2
if arch == "PlamoForCausalLM":
return gguf.MODEL_ARCH.PLAMO
if arch == "CodeShellForCausalLM":
return gguf.MODEL_ARCH.CODESHELL
if arch == "OrionForCausalLM":
return gguf.MODEL_ARCH.ORION
if arch == "InternLM2ForCausalLM":
return gguf.MODEL_ARCH.INTERNLM2
if arch == "MiniCPMForCausalLM":
return gguf.MODEL_ARCH.MINICPM
if arch == "BertModel":
return gguf.MODEL_ARCH.BERT

raise NotImplementedError(f'Architecture "{arch}" not supported!')

# used for GPT-2 BPE and WordPiece vocabs
def get_basic_vocab(self) -> tuple[list[str], list[int]]:
Expand Down Expand Up @@ -256,9 +375,10 @@ def get_basic_vocab(self) -> tuple[list[str], list[int]]:

return tokens, toktypes

def _set_vocab_gpt2(self) -> None:

def _set_vocab_gpt2(self, tokenizer_model:str = "gpt2") -> None:
tokens, toktypes = self.get_basic_vocab()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_model(tokenizer_model)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)

Expand Down Expand Up @@ -1181,7 +1301,31 @@ def write_tensors(self):
n_dims = len(data.shape)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
@Model.register("LlamaForCausalLM")
class DeepseekCoderModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA

def set_gguf_parameters(self):
super().set_gguf_parameters()
head_count = self.hparams["num_attention_heads"]
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])

if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

def set_vocab(self):
self._set_vocab_gpt2("deepseek_coder")

class DeepseekLLMModel(DeepseekCoderModel):
def set_vocab(self):
self._set_vocab_gpt2("deepseek_llm")

@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
class StableLMModel(Model):
Expand Down Expand Up @@ -2459,6 +2603,7 @@ def parse_args() -> argparse.Namespace:
"model", type=Path,
help="directory containing model file",
)
parser.add_argument("--model-name", type=str, default=None, help="name of the model")

return parser.parse_args()

Expand Down
Loading
Loading