improvement(tools): optimize convert-pth-to-ggml #232

Closed · wants to merge 3 commits
290 changes: 152 additions & 138 deletions convert-pth-to-ggml.py
```diff
@@ -22,19 +22,29 @@
 import struct
 import numpy as np
 import torch
+import argparse
+import os
```

> **Contributor:** `os` is already imported.

```diff
 from sentencepiece import SentencePieceProcessor

-if len(sys.argv) < 3:
-    print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=
+        'Convert ckpt models to ggml models. For example: python3 convert-pth-to-ggml.py ../llama-models/7B/ f32 models/llama-7B '
+    )
+    parser.add_argument('dir_model',
+                        type=str,
+                        help='Directory path of the checkpoint model')
+    parser.add_argument('ftype',
+                        type=str,
+                        choices=['f32', 'f16'],
+                        help='Data type of the converted tensor, f32 or f16')
+    parser.add_argument('out_dir',
+                        type=str,
+                        help='Directory path for storing ggml model')
+    return parser.parse_args()

-fname_hparams = sys.argv[1] + "/params.json"
-fname_tokenizer = sys.argv[1] + "/../tokenizer.model"

 def get_n_parts(dim):
     if dim == 4096:
```
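With the switch to argparse, the converter takes three positional arguments (`dir_model`, `ftype`, `out_dir`) instead of raw `sys.argv` handling, and writes the ggml files into a separate output directory. The invocation from the new help text (the paths are the example ones from the PR):

```
python3 convert-pth-to-ggml.py ../llama-models/7B/ f32 models/llama-7B
```

For context, `get_n_parts()` maps the model width to the number of `consolidated.0N.pth` shards to convert (in the upstream script: 4096 → 1, 5120 → 2, 6656 → 4, 8192 → 8).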
```diff
@@ -49,133 +59,137 @@ def get_n_parts(dim):
     print("Invalid dim: " + str(dim))
     sys.exit(1)

-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-if os.path.exists(fname_out):
-    print(f"Skip conversion, it already exists: {fname_out}")
-    sys.exit(0)
-
-with open(fname_hparams, "r") as f:
-    hparams = json.load(f)
-
-tokenizer = SentencePieceProcessor(fname_tokenizer)
-
-hparams.update({"vocab_size": tokenizer.vocab_size()})
-
-n_parts = get_n_parts(hparams["dim"])
-
-print(hparams)
-print('n_parts = ', n_parts)
-
-for p in range(n_parts):
-    print('Processing part ', p)
-
-    #fname_model = sys.argv[1] + "/consolidated.00.pth"
-    fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-    if (p > 0):
-        fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
-
-    model = torch.load(fname_model, map_location="cpu")
-
-    fout = open(fname_out, "wb")
-
-    fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-    fout.write(struct.pack("i", hparams["vocab_size"]))
-    fout.write(struct.pack("i", hparams["dim"]))
-    fout.write(struct.pack("i", hparams["multiple_of"]))
-    fout.write(struct.pack("i", hparams["n_heads"]))
-    fout.write(struct.pack("i", hparams["n_layers"]))
-    fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
-    fout.write(struct.pack("i", ftype))
-
-    # Is this correct??
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            # "<unk>" token (translated as ??)
-            text = " \u2047 ".encode("utf-8")
-            fout.write(struct.pack("i", len(text)))
-            fout.write(text)
-        elif tokenizer.is_control(i):
-            # "<s>"/"</s>" tokens
-            fout.write(struct.pack("i", 0))
-        elif tokenizer.is_byte(i):
-            # "<U+XX>" tokens (which may be invalid UTF-8)
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print("Invalid token: " + piece)
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            fout.write(struct.pack("i", 1))
-            fout.write(struct.pack("B", byte_value))
-        else:
-            # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            fout.write(struct.pack("i", len(text)))
-            fout.write(text)
-
-    for k, v in model.items():
-        name = k
-        shape = v.shape
-
-        # skip layers.X.attention.inner_attention.rope.freqs
-        if name[-5:] == "freqs":
-            continue
-
-        print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
-
-        #data = tf.train.load_variable(dir_model, name).squeeze()
-        data = v.numpy().squeeze()
-        n_dims = len(data.shape);
-
-        # for efficiency - transpose some matrices
-        # "model/h.*/attn/c_attn/w"
-        # "model/h.*/attn/c_proj/w"
-        # "model/h.*/mlp/c_fc/w"
-        # "model/h.*/mlp/c_proj/w"
-        #if name[-14:] == "/attn/c_attn/w" or \
-        #   name[-14:] == "/attn/c_proj/w" or \
-        #   name[-11:] == "/mlp/c_fc/w" or \
-        #   name[-13:] == "/mlp/c_proj/w":
-        #    print("  Transposing")
-        #    data = data.transpose()
-
-        dshape = data.shape
-
-        # default type is fp16
-        ftype_cur = 1
-        if ftype == 0 or n_dims == 1:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-        # header
-        sname = name.encode('utf-8')
-        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
-        for i in range(n_dims):
-            fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-        fout.write(sname);
-
-        # data
-        data.tofile(fout)
-
-    # I hope this deallocates the memory ..
-    model = None
-
-    fout.close()
-
-    print("Done. Output file: " + fname_out + ", (part ", p, ")")
-    print("")
```
```diff
+def main():
+    args = parse_args()
+    dir_model = args.dir_model
+    out_dir = args.out_dir
+
+    if not os.path.exists(out_dir):
+        os.mkdir(out_dir)
+
+    ftype = args.ftype
+    ftype_int = {'f32': 0, 'f16': 1}
+    fname_hparams = os.path.join(dir_model, 'params.json')
+    fname_tokenizer = os.path.join(dir_model, '..', 'tokenizer.model')
+
+    with open(fname_hparams, "r") as f:
+        hparams = json.load(f)
+
+    tokenizer = SentencePieceProcessor(fname_tokenizer)
+
+    hparams.update({"vocab_size": tokenizer.vocab_size()})
+
+    n_parts = get_n_parts(hparams["dim"])
+
+    print(hparams)
+    print('n_parts = ', n_parts)
+
+    for p in range(n_parts):
+        print('Processing part ', p)
+
+        #fname_model = sys.argv[1] + "/consolidated.00.pth"
+        fname_model = os.path.join(dir_model, "consolidated.0{}.pth".format(p))
+        if p > 0:
+            fname_out = os.path.join(out_dir,
+                                     "ggml-model-{}.bin.{}".format(ftype, p))
+        else:
+            fname_out = os.path.join(out_dir,
+                                     "ggml-model-{}.bin".format(ftype))
+
+        model = torch.load(fname_model, map_location="cpu")
+
+        fout = open(fname_out, "wb")
+
+        fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+        fout.write(struct.pack("i", hparams["vocab_size"]))
+        fout.write(struct.pack("i", hparams["dim"]))
+        fout.write(struct.pack("i", hparams["multiple_of"]))
+        fout.write(struct.pack("i", hparams["n_heads"]))
+        fout.write(struct.pack("i", hparams["n_layers"]))
+        fout.write(struct.pack("i", hparams["dim"] //
+                               hparams["n_heads"])) # rot (obsolete)
+        fout.write(struct.pack("i", ftype_int[ftype]))
+
+        # Is this correct??
+        for i in range(tokenizer.vocab_size()):
+            if tokenizer.is_unknown(i):
+                # "<unk>" token (translated as ??)
+                text = " \u2047 ".encode("utf-8")
+                fout.write(struct.pack("i", len(text)))
+                fout.write(text)
+            elif tokenizer.is_control(i):
+                # "<s>"/"</s>" tokens
+                fout.write(struct.pack("i", 0))
+            elif tokenizer.is_byte(i):
+                # "<U+XX>" tokens (which may be invalid UTF-8)
+                piece = tokenizer.id_to_piece(i)
+                if len(piece) != 6:
+                    print("Invalid token: " + piece)
+                    sys.exit(1)
+                byte_value = int(piece[3:-1], 16)
+                fout.write(struct.pack("i", 1))
+                fout.write(struct.pack("B", byte_value))
+            else:
+                # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
+                text = tokenizer.id_to_piece(i).replace("\u2581",
+                                                        " ").encode("utf-8")
+                fout.write(struct.pack("i", len(text)))
+                fout.write(text)
+
+        for k, v in model.items():
+            name = k
+            shape = v.shape
+
+            # skip layers.X.attention.inner_attention.rope.freqs
+            if name[-5:] == "freqs":
+                continue
+
+            print("Processing variable: " + name + " with shape: ", shape,
+                  " and type: ", v.dtype)
+
+            #data = tf.train.load_variable(dir_model, name).squeeze()
+            data = v.numpy().squeeze()
+            n_dims = len(data.shape)
+
+            # for efficiency - transpose some matrices
+            # "model/h.*/attn/c_attn/w"
+            # "model/h.*/attn/c_proj/w"
+            # "model/h.*/mlp/c_fc/w"
+            # "model/h.*/mlp/c_proj/w"
+            #if name[-14:] == "/attn/c_attn/w" or \
+            #   name[-14:] == "/attn/c_proj/w" or \
+            #   name[-11:] == "/mlp/c_fc/w" or \
+            #   name[-13:] == "/mlp/c_proj/w":
+            #    print("  Transposing")
+            #    data = data.transpose()
+
+            dshape = data.shape
+
+            # default type is fp16
+            ftype_cur = 1
+            if ftype == 'f32' or n_dims == 1:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+
+            # header
+            sname = name.encode('utf-8')
+            fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+            for i in range(n_dims):
+                fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
+            fout.write(sname)
+
+            # data
+            data.tofile(fout)
+
+        # I hope this deallocates the memory ..
+        model = None
+
+        fout.close()
+
+        print("Done. Output file: " + fname_out + ", (part ", p, ")")
+        print("")
+
+
+if __name__ == '__main__':
+    main()
```
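Since each tensor record carries its own type flag (`ftype_cur`), a reader has to honor that flag rather than the file-level `ftype`: 1-D tensors are stored as float32 even in f16 models. As a quick sanity check of the on-disk layout, here is a minimal sketch (not part of the PR; the file path is illustrative) that reads back the fixed header and the first vocabulary entry, mirroring the `struct.pack` calls above:

```python
import struct

# Minimal sketch (not part of this PR): read back the ggml header and the
# first vocab entry. The path is illustrative; the formats mirror the
# struct.pack("i", ...) calls in the converter.
with open("models/llama-7B/ggml-model-f32.bin", "rb") as fin:
    (magic, vocab_size, dim, multiple_of,
     n_heads, n_layers, rot, ftype) = struct.unpack("8i", fin.read(32))
    assert magic == 0x67676d6c  # "ggml" magic written by the converter

    # Each vocab entry: a 4-byte length, then that many bytes of UTF-8 text
    # (control tokens are written with length 0, byte tokens with length 1).
    (length,) = struct.unpack("i", fin.read(4))
    first_piece = fin.read(length)

print(vocab_size, dim, n_heads, n_layers, ftype, first_piece)
```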