Add save/load for pt2e example #1927

Merged · 18 commits · Jul 29, 2024
Changes from 11 commits
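For context, the save/load flow these changes add to both examples is roughly the sketch below. The model, calibration input, and output directory are placeholders, and the export import path follows the 3.x pt2e examples, so treat the details as assumptions rather than a definitive API reference.

import torch
import torchvision.models as models
from neural_compressor.torch.export import export  # pt2e export helper, as used in the examples
from neural_compressor.torch.quantization import prepare, convert, get_default_static_config, load

model = models.resnet18().eval()
example_inputs = (torch.randn(1, 3, 224, 224),)  # placeholder calibration input

# export -> prepare -> calibrate -> convert
exported_model = export(model, example_inputs=example_inputs)
prepared_model = prepare(exported_model, quant_config=get_default_static_config())
for _ in range(2):  # a real run iterates the calibration dataloader instead
    prepared_model(*example_inputs)
q_model = convert(prepared_model)

# save the quantized model, then reload it later (what -o/--output_dir and --int8 wire up)
q_model.save(example_inputs=example_inputs, output_dir="saved_results")
int8_model = load("saved_results")
int8_model(*example_inputs)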
73 changes: 69 additions & 4 deletions examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -79,8 +79,17 @@
parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark")
parser.add_argument('-q', '--quantize', dest='quantize', action='store_true',
help='quantize model')
parser.add_argument("--calib_iters", default=2, type=int,
parser.add_argument("--calib_iters", default=-1, type=int,
help="For calibration only.")
parser.add_argument('-o', '--output_dir', default='', type=str, metavar='PATH',
help='path to quantized result.')
parser.add_argument('--performance', dest='performance', action='store_true',
help='do benchmark')
parser.add_argument("--iters", default=-1, type=int,
help="For benchmark only.")
parser.add_argument('--int8', dest='int8', action='store_true',
help='Load quantized model')


best_acc1 = 0

@@ -288,8 +297,20 @@ def main_worker(gpu, ngpus_per_node, args):

prepared_model = prepare(exported_model, quant_config=quant_config)
# Calibrate
for i in range(args.calib_iters):
prepared_model(*example_inputs)
with torch.no_grad():
for i, (images, target) in enumerate(val_loader):
if i == args.calib_iters:
break
if args.gpu is not None and torch.cuda.is_available():
images = images.cuda(args.gpu, non_blocking=True)
if torch.backends.mps.is_available():
images = images.to('mps')
target = target.to('mps')
if torch.cuda.is_available():
target = target.cuda(args.gpu, non_blocking=True)
# run the prepared model on calibration data so the observers collect statistics
prepared_model(images)

q_model = convert(prepared_model)
# Compile the quantized model and replace the Q/DQ pattern with Q-operator
from torch._inductor import config
@@ -298,11 +319,55 @@ def main_worker(gpu, ngpus_per_node, args):
opt_model = torch.compile(q_model)
model = opt_model


if args.output_dir:
model.save(example_inputs=example_inputs, output_dir = args.output_dir)

if args.int8:
if args.output_dir:
print("load int8 model")
from neural_compressor.torch.quantization import load
model = load(args.output_dir)


if args.evaluate:
validate(val_loader, model, criterion, args)
return

if args.performance:
benchmark(val_loader, model, args)
return

def benchmark(val_loader, model, args):

total_iters = args.iters
warmup_iters = 5
with torch.no_grad():

for i, (images, target) in enumerate(val_loader):
if i == total_iters:
break
if i == warmup_iters:
start = time.time()

if args.gpu is not None and torch.cuda.is_available():
images = images.cuda(args.gpu, non_blocking=True)
if torch.backends.mps.is_available():
images = images.to('mps')
target = target.to('mps')
if torch.cuda.is_available():
target = target.cuda(args.gpu, non_blocking=True)

# model inference
model(images)

if i % args.print_freq == 0:
print(f"benchmarking... {i+1}/{total_iters}")

end = time.time()
latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size)
throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start)
print("Latency: {:.3f} ms".format(latency * 10**3))
print("Throughput: {:.3f} samples/sec".format(throughput))

def validate(val_loader, model, criterion, args):

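As a quick sanity check on the benchmark math added above: with the run_benchmark.sh defaults of iters=100 and batch_size=16, and warmup_iters=5, a measured window of, say, 10 seconds corresponds to latency = 10 / (95 × 16) s ≈ 6.6 ms per sample and throughput = (95 × 16) / 10 = 152 samples/sec.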
89 changes: 89 additions & 0 deletions examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
@@ -0,0 +1,89 @@
#!/bin/bash
set -x

function main {

init_params "$@"
run_benchmark

}

# init params
function init_params {
iters=100
batch_size=16
tuned_checkpoint=saved_results
echo ${max_eval_samples}
for var in "$@"
do
case $var in
--topology=*)
topology=$(echo $var |cut -f2 -d=)
;;
--dataset_location=*)
dataset_location=$(echo $var |cut -f2 -d=)
;;
--input_model=*)
input_model=$(echo $var |cut -f2 -d=)
;;
--mode=*)
mode=$(echo $var |cut -f2 -d=)
;;
--batch_size=*)
batch_size=$(echo $var |cut -f2 -d=)
;;
--iters=*)
iters=$(echo ${var} |cut -f2 -d=)
;;
--int8=*)
int8=$(echo ${var} |cut -f2 -d=)
;;
--config=*)
tuned_checkpoint=$(echo $var |cut -f2 -d=)
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
;;
esac
done

}


# run_benchmark
function run_benchmark {
extra_cmd=''

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" -e "
elif [[ ${mode} == "performance" ]]; then
mode_cmd=" --performance --iters "${iters}
else
echo "Error: No such mode: ${mode}"
exit 1
fi
if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
fi
echo $extra_cmd

if [ "${topology}" = "resnet18_pt2e_static" ]; then
model_name_or_path="resnet18"
fi

if [[ ${mode} == "accuracy" ]]; then
python main.py -a ${model_name_or_path} ${dataset_location} -e -o ${tuned_checkpoint} ${extra_cmd} ${mode_cmd}
elif [[ ${mode} == "performance" ]]; then
incbench --num_cores_per_instance 4 main.py -a ${model_name_or_path} \
${dataset_location} -o ${tuned_checkpoint} ${extra_cmd} ${mode_cmd}
else
echo "Error: No such mode: ${mode}"
exit 1
fi
}

main "$@"
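A typical invocation of this script might look like the following (the dataset path is a placeholder; --config should point at the directory produced by run_quant.sh):

bash run_benchmark.sh --topology=resnet18_pt2e_static --dataset_location=/path/to/imagenet --mode=performance --batch_size=16 --iters=100 --int8=true --config=saved_results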
3 changes: 2 additions & 1 deletion examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
@@ -38,8 +38,9 @@ function init_params {
function run_tuning {
if [ "${topology}" = "resnet18_pt2e_static" ]; then
model_name_or_path="resnet18"
output_dir="saved_results"
fi
python main.py -a ${model_name_or_path} ${dataset_location} -q -e
python main.py -a ${model_name_or_path} ${dataset_location} -q -o ${output_dir}
}

main "$@"
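For example, quantizing resnet18 and saving the result into saved_results (the dataset path is a placeholder, and this assumes run_quant.sh parses --topology/--dataset_location the same way as run_benchmark.sh above):

bash run_quant.sh --topology=resnet18_pt2e_static --dataset_location=/path/to/imagenet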
New file (benchmark script for the opt-125m run_clm_no_trainer.py example):
@@ -0,0 +1,99 @@
#!/bin/bash
set -x

function main {

init_params "$@"
run_benchmark

}

# init params
function init_params {
iters=100
batch_size=16
tuned_checkpoint=saved_results
task=lambada_openai
echo ${max_eval_samples}
for var in "$@"
do
case $var in
--topology=*)
topology=$(echo $var |cut -f2 -d=)
;;
--dataset_location=*)
dataset_location=$(echo $var |cut -f2 -d=)
;;
--input_model=*)
input_model=$(echo $var |cut -f2 -d=)
;;
--mode=*)
mode=$(echo $var |cut -f2 -d=)
;;
--batch_size=*)
batch_size=$(echo $var |cut -f2 -d=)
;;
--iters=*)
iters=$(echo ${var} |cut -f2 -d=)
;;
--int8=*)
int8=$(echo ${var} |cut -f2 -d=)
;;
--config=*)
tuned_checkpoint=$(echo $var |cut -f2 -d=)
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
;;
esac
done

}


# run_benchmark
function run_benchmark {
extra_cmd=''

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" --accuracy "
extra_cmd=$extra_cmd
elif [[ ${mode} == "performance" ]]; then
mode_cmd=" --performance --iters "${iters}
extra_cmd=$extra_cmd
else
echo "Error: No such mode: ${mode}"
exit 1
fi

if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
fi
echo $extra_cmd

if [ "${topology}" = "opt_125m_pt2e_static" ]; then
model_name_or_path="facebook/opt-125m"
fi
if [[ ${mode} == "accuracy" ]]; then
python -u run_clm_no_trainer.py \
--model ${model_name_or_path} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
elif [[ ${mode} == "performance" ]]; then
incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
--model ${model_name_or_path} \
--batch_size ${batch_size} \
--output_dir ${tuned_checkpoint} \
${extra_cmd} ${mode_cmd}
else
echo "Error: No such mode: ${mode}"
exit 1
fi
}

main "$@"
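A hypothetical accuracy run against a model previously quantized into saved_results:

bash run_benchmark.sh --topology=opt_125m_pt2e_static --mode=accuracy --batch_size=16 --int8=true --config=saved_results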
Changes to run_clm_no_trainer.py (opt-125m example):
@@ -14,7 +14,7 @@
"--revision", default=None,
help="Transformers parameter: set the model hub commit number")
parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k")
parser.add_argument("--output_dir", nargs="?", default="./saved_results")
parser.add_argument("--output_dir", nargs="?", default="")
parser.add_argument("--quantize", action="store_true")
parser.add_argument("--approach", type=str, default='static',
help="Select from ['dynamic', 'static', 'weight-only']")
@@ -80,7 +80,7 @@ def get_example_inputs(tokenizer):
dynamic_shapes = {"input_ids": (batch, seq_len)}
example_inputs = get_example_inputs(tokenizer)
exported_model = export(user_model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes)

quant_config = get_default_static_config()
# prepare
prepare_model = prepare(exported_model, quant_config)
@@ -99,8 +99,23 @@ def get_example_inputs(tokenizer):
opt_model.config = user_model.config # for lm eval
user_model = opt_model

# save
if args.output_dir:
user_model.save(example_inputs=example_inputs, output_dir = args.output_dir)



if args.int8:
if args.output_dir:
print("Load int8 model.")
from neural_compressor.torch.quantization import load
model = load(args.output_dir)

model.config = user_model.config # for lm eval
user_model = model

if args.accuracy:

from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
@@ -120,29 +135,21 @@ def get_example_inputs(tokenizer):
print('Batch size = %d' % args.batch_size)

if args.performance:
# user_model.eval()
from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
batch_size, input_leng = args.batch_size, 512
example_inputs = torch.ones((batch_size, input_leng), dtype=torch.long)
print("Batch size = {:d}".format(batch_size))
print("The length of input tokens = {:d}".format(input_leng))
import time

samples = args.iters * args.batch_size
eval_args = LMEvalParser(
model="hf",
user_model=user_model,
tokenizer=tokenizer,
batch_size=args.batch_size,
tasks=args.tasks,
limit=samples,
device="cpu",
)
start = time.time()
results = evaluate(eval_args)
end = time.time()
for task_name in args.tasks.split(","):
if task_name == "wikitext":
acc = results["results"][task_name]["word_perplexity,none"]
else:
acc = results["results"][task_name]["acc,none"]
print("Accuracy: %.5f" % acc)
print('Throughput: %.3f samples/sec' % (samples / (end - start)))
print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
print('Batch size = %d' % args.batch_size)
total_iters = args.iters
warmup_iters = 5
with torch.no_grad():
for i in range(total_iters):
if i == warmup_iters:
start = time.time()
user_model(example_inputs)
end = time.time()
latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size)
throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start)
print("Latency: {:.3f} ms".format(latency * 10**3))
print("Throughput: {:.3f} samples/sec".format(throughput))
Changes to run_quant.sh (opt-125m example):
@@ -39,8 +39,9 @@ function run_tuning {

if [ "${topology}" = "opt_125m_pt2e_static" ]; then
model_name_or_path="facebook/opt-125m"
output_dir="saved_results"
fi
python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy --tasks "lambada_openai"
python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --output_dir ${output_dir} --tasks "lambada_openai"
}

main "$@"
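And the corresponding quantization step that produces saved_results (assuming this run_quant.sh parses --topology like the benchmark script):

bash run_quant.sh --topology=opt_125m_pt2e_static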