diff --git a/examples/pytorch/gpt/utils/huggingface_bloom_convert.py b/examples/pytorch/gpt/utils/huggingface_bloom_convert.py
index d858e6600..7c78711f4 100644
--- a/examples/pytorch/gpt/utils/huggingface_bloom_convert.py
+++ b/examples/pytorch/gpt/utils/huggingface_bloom_convert.py
@@ -209,9 +209,8 @@ def handle_exceptions(model_config: PretrainedConfig,
     return param
 
 
-def convert_and_save_parameter(config: PretrainedConfig,
-                               name: str,
-                               param: torch.nn.Parameter,
+def convert_and_save_parameter(param_name: str,
+                               param,
                                tensor_para_size: Optional[int],
                                save_dir: PathLike):
     """ Convert and save to FT parameter
@@ -228,12 +227,6 @@ def convert_and_save_parameter(config: PretrainedConfig,
         save_dir: str or Path, a base directory of binary files.
     """
 
-    # Preprocess
-    param_name = convert_parameter_name(name)
-    param = safe_transpose(param)
-    param = handle_exceptions(config, param_name, param)
-
-    param = param.detach().cpu().numpy()
     save_dir = Path(save_dir)
 
     if not is_split_param(param_name):
@@ -325,15 +318,23 @@ def main():
                 f'{len(list(model.parameters()))} params')
     if args.processes > 1:
         pool = multiprocessing.Pool(args.processes)
-        pool.starmap_async(
-            convert_and_save_parameter,
-            [(model.config, name, param, tp_size, save_dir)
-             for name, param in model.named_parameters()])
+        star_args = []
+        for name, param in model.named_parameters():
+            # Preprocess
+            param_name = convert_parameter_name(name)
+            param = safe_transpose(param)
+            param = handle_exceptions(model.config, param_name, param)
+            star_args.append((param_name, param.detach().cpu().numpy(), tp_size, save_dir))
+        pool.starmap_async(convert_and_save_parameter, star_args)
         pool.close()
         pool.join()
     else:
         for name, param in model.named_parameters():
-            convert_and_save_parameter(model.config, name, param, tp_size, save_dir)
+            # Preprocess
+            param_name = convert_parameter_name(name)
+            param = safe_transpose(param)
+            param = handle_exceptions(model.config, param_name, param)
+            convert_and_save_parameter(param_name, param.detach().cpu().numpy(), tp_size, save_dir)
     elapsed_time = time.time() - start_time
     logger.info(f'Checkpoint conversion (HF >> FT) has done '
                 f'(elapsed time: {elapsed_time:.2f} sec)')
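Note on the Bloom change above: all preprocessing (name conversion, transpose, exception handling, and the move to host memory) now happens in the parent process, so pool workers receive plain numpy arrays rather than torch.nn.Parameter objects. Parameters may still live on the GPU and are costly, and in some setups impossible, to pickle across process boundaries, while numpy arrays serialize cheaply. A minimal sketch of this pattern, with hypothetical save_param and convert_model helpers standing in for the converter's real functions:

    import multiprocessing

    import numpy as np
    import torch


    def save_param(name: str, arr: np.ndarray, save_dir: str):
        # Worker: only cheaply picklable arguments (str, ndarray)
        # cross the process boundary; no torch state is needed here.
        arr.tofile(f"{save_dir}/{name}.bin")


    def convert_model(model: torch.nn.Module, save_dir: str, processes: int = 4):
        # Parent: detach and move every parameter to host memory first,
        # so the pool never has to pickle a (possibly CUDA) Parameter.
        star_args = [(name, param.detach().cpu().numpy(), save_dir)
                     for name, param in model.named_parameters()]
        with multiprocessing.Pool(processes) as pool:
            pool.starmap(save_param, star_args)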
diff --git a/examples/pytorch/t5/utils/huggingface_t5_ckpt_convert.py b/examples/pytorch/t5/utils/huggingface_t5_ckpt_convert.py
index 7853a6ac3..bac188715 100644
--- a/examples/pytorch/t5/utils/huggingface_t5_ckpt_convert.py
+++ b/examples/pytorch/t5/utils/huggingface_t5_ckpt_convert.py
@@ -14,12 +14,14 @@
 
 import argparse
 import configparser
+import multiprocessing
 from datetime import datetime
 import logging
 from pathlib import Path
 import sys
 import os
 
+
 dir_path = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(dir_path + "/../../../../3rdparty/transformers/src/")
 
@@ -30,8 +32,10 @@
 
 LOGGER = logging.getLogger(__name__)
 
-rename_mapping={"relative_attention_num_buckets":"relative_attention_num_buckets_or_max_pos_seq_len"}
-new_configs={"structure":{"t5_with_bias":"false", "use_gated_activation":"false", "position_embedding_type":"relative"}}
+rename_mapping = {"relative_attention_num_buckets": "relative_attention_num_buckets_or_max_pos_seq_len"}
+new_configs = {
+    "structure": {"t5_with_bias": "false", "use_gated_activation": "false", "position_embedding_type": "relative"}}
+
 
 def get_weight_data_type(data_type):
     if data_type == "fp32":
@@ -41,12 +45,13 @@ def get_weight_data_type(data_type):
     else:
         assert False, f"Invalid weight data type {data_type}"
 
+
 def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
     model_dict = {}
     for name, param in model.named_parameters():
         if name.find("decoder") != -1 and name.find("SelfAttention") != -1:
             model_dict[name] = param
-    
+
     for i in range(model.decoder.config.num_layers):
         shape = model_dict[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"].T.shape
         qkv = torch.cat([model_dict[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"].T,
@@ -55,16 +60,16 @@ def fuse_decoder_qkv(model, factor, saved_dir, np_weight_data_type):
 
         qkv = qkv.reshape([shape[0], 3, shape[1]])
         qkv = qkv.cpu().detach().numpy().astype(np_weight_data_type)
-        
+
         split_vals = np.split(qkv, factor, axis=-1)
         for j in range(factor):
             saved_path = saved_dir / f"decoder.block.{i}.layer.0.SelfAttention.qkv.weight.{j}.bin"
             split_vals[j].tofile(saved_path.as_posix())
-        
-def split_and_convert_process(key, val, factor, saved_dir, np_weight_data_type):
-    if val.dim() == 2:
+
+
+def split_and_convert_process(key, val, factor, saved_dir):
+    if val.ndim == 2:  # val is now a numpy array, so ndim rather than torch's dim()
         val = val.transpose(1, 0)
-    val = val.cpu().detach().numpy().astype(np_weight_data_type)
     saved_key = key
     LOGGER.debug(f"key: {key}, val.shape: {val.shape}")
 
@@ -72,50 +77,50 @@ def split_and_convert_process(key, val, factor, saved_dir, np_weight_data_type)
         # shared weights, only need to convert the weights of rank 0
         saved_path = saved_dir / f"{saved_key}.bin"
         val.tofile(saved_path.as_posix())
-        
+
         saved_path = saved_dir / f"{saved_key}_T.bin"
         val.T.tofile(saved_path.as_posix())
     elif key.find("lm_head.weight") != -1:
         # lm_head weights, only need to convert the weights of rank 0
-        val = val.transpose(1, 0) # For lm_head, we use TN gemm to compute, so we don't need to transpose
+        val = val.transpose(1, 0)  # For lm_head, we use TN gemm to compute, so we don't need to transpose
         saved_path = saved_dir / f"{saved_key}.bin"
         val.tofile(saved_path.as_posix())
-        
+
     elif key.find("layer_norm.weight") != -1:
         # shared weights, only need to convert the weights of rank 0
         saved_path = saved_dir / f"{saved_key}.bin"
         val.tofile(saved_path.as_posix())
 
     elif (
-        key.find("SelfAttention.o.weight") != -1
-        or key.find("EncDecAttention.o.weight") != -1
-        or key.find("DenseReluDense.wo.weight") != -1
-        ):
+            key.find("SelfAttention.o.weight") != -1
+            or key.find("EncDecAttention.o.weight") != -1
+            or key.find("DenseReluDense.wo.weight") != -1
+    ):
         split_vals = np.split(val, factor, axis=0)
         for j in range(factor):
             saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
 
     elif (
-        key.find("DenseReluDense.wi.weight") != -1
-        or (key.find("encoder") != -1 and (
-            key.find("SelfAttention.q.weight") != -1
-            or key.find("SelfAttention.k.weight") != -1
-            or key.find("SelfAttention.v.weight") != -1
-            )
-            )
-        or key.find("EncDecAttention.q.weight") != -1
-        or key.find("EncDecAttention.k.weight") != -1
-        or key.find("EncDecAttention.v.weight") != -1
-        ):
+            key.find("DenseReluDense.wi.weight") != -1
+            or (key.find("encoder") != -1 and (
+                key.find("SelfAttention.q.weight") != -1
+                or key.find("SelfAttention.k.weight") != -1
+                or key.find("SelfAttention.v.weight") != -1
+            )
+            )
+            or key.find("EncDecAttention.q.weight") != -1
+            or key.find("EncDecAttention.k.weight") != -1
+            or key.find("EncDecAttention.v.weight") != -1
+    ):
         split_vals = np.split(val, factor, axis=-1)
         for j in range(factor):
             saved_path = saved_dir / f"{saved_key}.{j:d}.bin"
             split_vals[j].tofile(saved_path.as_posix())
 
     elif (
-        key.find("DenseReluDense.wi_0.weight") != -1
-        or key.find("DenseReluDense.wi_1.weight") != -1
-        ):
+            key.find("DenseReluDense.wi_0.weight") != -1
+            or key.find("DenseReluDense.wi_1.weight") != -1
+    ):
         # For gated activation.
if key.find("DenseReluDense.wi_0.weight") != -1: saved_key = key.replace("wi_0", "wi") @@ -131,20 +136,21 @@ def split_and_convert_process(key, val, factor, saved_dir, np_weight_data_type): saved_path = saved_dir / f"{saved_key}.{j:d}.bin" split_vals[j].tofile(saved_path.as_posix()) elif ( - key.find("decoder") != -1 and - ( - key.find("SelfAttention.q.weight") != -1 - or key.find("SelfAttention.k.weight") != -1 - or key.find("SelfAttention.v.weight") != -1 - ) - ): + key.find("decoder") != -1 and + ( + key.find("SelfAttention.q.weight") != -1 + or key.find("SelfAttention.k.weight") != -1 + or key.find("SelfAttention.v.weight") != -1 + ) + ): pass elif key.find("encoder.embed_tokens.weight") != -1 or \ - key.find("decoder.embed_tokens.weight") != -1: + key.find("decoder.embed_tokens.weight") != -1: LOGGER.warning(f"Not save {key}, using shared.weight directly.") else: LOGGER.warning(f"cannot find key '{key}' with shape {val.shape}") + def convert_checkpoint(args): saved_dir = Path(args.saved_dir) / f"{args.inference_tensor_para_size:d}-gpu" saved_dir.mkdir(parents=True, exist_ok=True) @@ -180,20 +186,29 @@ def convert_checkpoint(args): with open((saved_dir / f"config.ini").as_posix(), 'w') as configfile: config.write(configfile) np_weight_data_type = get_weight_data_type(args.weight_data_type) - + i_gpu_num = args.inference_tensor_para_size - - for name, param in t5_model.state_dict().items(): - split_and_convert_process(name, param, i_gpu_num, saved_dir, np_weight_data_type) + + pool = multiprocessing.Pool(args.processes) + pool.starmap_async(split_and_convert_process, + [(name, param.cpu().detach().numpy().astype(np_weight_data_type), i_gpu_num, saved_dir) + for name, param in t5_model.state_dict().items()]) + + pool.close() + pool.join() if not args.encoder_only: fuse_decoder_qkv(t5_model, i_gpu_num, saved_dir, np_weight_data_type) + if __name__ == "__main__": parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("-saved_dir", "-o", type=str, help="file name of output file", required=True) parser.add_argument("-in_file", "-i", type=str, help="file name of input checkpoint file", required=True) - parser.add_argument("-inference_tensor_para_size", "-i_g", type=int, help="How many gpus for inference", required=True) + parser.add_argument("-inference_tensor_para_size", "-i_g", type=int, help="How many gpus for inference", + required=True) + parser.add_argument("-processes", "-p", type=int, help="How many processes to spawn for conversion (default: 4)", + default=4) parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]) parser.add_argument("--encoder_only", "-e", action="store_true") parser.add_argument("--verbose", action="store_true", help="Provide verbose messages") @@ -205,7 +220,7 @@ def convert_checkpoint(args): LOGGER.info(f"{key}: {vars(args)[key]}") LOGGER.info("========================================") - start_time = datetime.now() + start_time = datetime.now() convert_checkpoint(args) stop_time = datetime.now() run_time = (stop_time - start_time)