diff --git a/llm/auto_parallel/llama/README.md b/llm/auto_parallel/llama/README.md
index 68f4849195ef..705cacd1c604 100644
--- a/llm/auto_parallel/llama/README.md
+++ b/llm/auto_parallel/llama/README.md
@@ -32,3 +32,32 @@ cd ../../../slm/model_zoo/gpt-3/external_ops/ && python3 setup.py install && cd
 Refer to the training script **run_pretrain_auto.sh** and enable `to_static=1` to run the dp2mp2pp2 parallel strategy on 8 GPUs.
 You can start from **run_pretrain_auto.sh** and modify the relevant parameters as needed for training.
+
+## 4. Inference
+The inference pipeline is: dynamic-graph inference -> dynamic-to-static model export -> static-graph inference. Model parameters saved by auto-parallel pretraining can already be used for dynamic-graph inference; for dynamic-to-static export and static-graph inference, see the [LLaMA documentation](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/llama.md).
+
+The following takes dynamic-graph auto-parallel training (dp2mp2pp2) as an example.
+- Merge the distributed checkpoint into single-card model parameters:
+
+```python
+import paddle
+import paddle.distributed as dist
+
+ckpt_path = '/path/for/dist_ckpt'
+# offload=1 offloads parameters to CPU to reduce GPU memory usage
+merged_state_dict = dist.checkpoint.load_state_dict.load_merged_state_dict(ckpt_path, offload=1)
+paddle.save(merged_state_dict, 'model_state.pdparams')
+```
+
+The merged parameters above are in Paddle's native format. To convert them to the unified_param format (safetensors), run:
+
+```shell
+python PaddleNLP/llm/auto_parallel/utils/convert_to_safetensors.py --input_path input_path [--output_dir output_dir] [--split_num split_num] [--offload]
+
+# Arguments
+# --input_path: path to the input single-card model parameters
+# --output_dir: optional, output directory for the converted parameters, defaults to './tmp'
+# --split_num: optional, number of output parameter shards, defaults to 1
+# --offload: optional flag; if set, parameters are offloaded to CPU before saving (disabled by default)
+```
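+
+Before moving on to inference, you can load the merged weights back and inspect a few entries. A minimal sanity-check sketch (assumes the merge step above saved `model_state.pdparams` in the current directory):
+
+```python
+import paddle
+
+# Load the merged single-card weights written by the merge step.
+state_dict = paddle.load('model_state.pdparams')
+print(f'{len(state_dict)} tensors in the merged checkpoint')
+for name, tensor in list(state_dict.items())[:5]:
+    print(name, tensor.shape, tensor.dtype)
+```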
+
+- Dynamic-graph inference
+
+  [LLM inference tutorial](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/inference.md)
diff --git a/llm/auto_parallel/utils/convert_to_safetensors.py b/llm/auto_parallel/utils/convert_to_safetensors.py
new file mode 100644
index 000000000000..6f000e1e8955
--- /dev/null
+++ b/llm/auto_parallel/utils/convert_to_safetensors.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+import paddle
+from safetensors.numpy import save_file as safe_save_file
+
+from paddlenlp.transformers.utils import dtype_byte_size
+from paddlenlp.utils.env import SAFE_WEIGHTS_INDEX_NAME
+
+
+def convert_to_unified_ckpt(path: str, output_dir: str = "./tmp", split_num: int = 1, offload: bool = False):
+    """
+    Convert a single-card checkpoint to the unified (safetensors) format.
+
+    Args:
+        path (str): The path to the input checkpoint file.
+        output_dir (str, optional): The directory where the converted files will be saved. Defaults to "./tmp".
+        split_num (int, optional): The number of shards to split the weights into. Defaults to 1.
+        offload (bool, optional): Whether to offload the weights to CPU memory before saving them. Defaults to False.
+    """
+
+    def get_sub_state_dict(sub_keys, state_dict, weight_filename, index_weight_file, total_size):
+        """
+        Build the sub-state dict for one shard and update the index weight file and total size.
+
+        Args:
+            sub_keys (list): The keys that belong to this sub-state dict.
+            state_dict (dict): The original state dict.
+            weight_filename (str): The filename of the corresponding weight file.
+            index_weight_file (dict): The mapping from keys to their corresponding weight filenames.
+            total_size (int): The total size of the model so far.
+        """
+        sub_state_dict = {key: state_dict[key].numpy() for key in sub_keys}
+        for key in sub_keys:
+            index_weight_file[key] = weight_filename
+            total_size += state_dict[key].numel().item() * dtype_byte_size(state_dict[key].dtype)
+        return sub_state_dict, total_size
+
+    if offload:
+        paddle.set_device("cpu")
+    state_dict = paddle.load(path)
+    all_keys = list(state_dict.keys())
+    split_size = len(all_keys) // split_num
+    extra_keys = len(all_keys) % split_num
+    index_weight_file = {}
+    total_size = 0
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    index = 0
+    for rank in range(split_num):
+        # The first `extra_keys` shards each take one extra key so that all keys are covered.
+        current_size = split_size + (1 if rank < extra_keys else 0)
+        sub_keys = all_keys[index : index + current_size]
+        index += current_size
+        weight_filename = f"model-{rank+1:04d}-of-{split_num:04d}.safetensors"
+        sub_state_dict, total_size = get_sub_state_dict(
+            sub_keys, state_dict, weight_filename, index_weight_file, total_size
+        )
+        safe_save_file(sub_state_dict, os.path.join(output_dir, weight_filename))
+    with open(os.path.join(output_dir, SAFE_WEIGHTS_INDEX_NAME), "w") as f:
+        json.dump({"metadata": {"total_size": total_size}, "weight_map": index_weight_file}, f, indent=4)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_path", type=str, required=True, help="The path to the input checkpoint file.")
+    parser.add_argument(
+        "--output_dir", type=str, default="./tmp", help="The directory where the converted files will be saved."
+    )
+    parser.add_argument(
+        "--split_num", type=int, default=1, help="The number of shards to split the weights into."
+    )
+    parser.add_argument(
+        "--offload", action="store_true", help="Offload the weights to CPU memory before saving them."
+    )
+    args = parser.parse_args()
+    convert_to_unified_ckpt(args.input_path, args.output_dir, args.split_num, args.offload)
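
To verify the conversion, the sharded output can be reassembled from the index file the script writes. A minimal sketch (assumes `SAFE_WEIGHTS_INDEX_NAME` from `paddlenlp.utils.env` resolves to `model.safetensors.index.json`, and that the shards were written to `./tmp`):

```python
import json
import os

from safetensors.numpy import load_file


def load_unified_ckpt(ckpt_dir):
    """Reassemble a sharded safetensors checkpoint from its index file."""
    # weight_map maps each tensor name to the shard file that stores it.
    with open(os.path.join(ckpt_dir, "model.safetensors.index.json")) as f:
        index = json.load(f)
    state_dict = {}
    for shard in sorted(set(index["weight_map"].values())):
        # load_file returns a dict of numpy arrays for one shard.
        state_dict.update(load_file(os.path.join(ckpt_dir, shard)))
    return state_dict


if __name__ == "__main__":
    merged = load_unified_ckpt("./tmp")
    print(f"reassembled {len(merged)} tensors")
```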