diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py
index 563c3e9a9..c7b9a8a93 100644
--- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py
@@ -247,13 +247,25 @@ def main():
                                    zero_stage=args.zero_stage,
                                    compute_fp32_loss=args.compute_fp32_loss)
 
+    # Model bigscience/bloom-560m has large variance at ln_f.weight parameter
+    # This makes bf16 finetuning hard.
+    # In general, since we are replacing the model head, it makes sense to reset
+    # the LN that precedes it.
+    force_optimize_params = []
+    if "bigscience/bloom-" in args.model_name_or_path:
+        torch.nn.init.ones_(rm_model.rwtransformer.ln_f.weight)
+        torch.nn.init.zeros_(rm_model.rwtransformer.ln_f.bias)
+        force_optimize_params.extend(
+            ['rwtransformer.ln_f.weight', 'rwtransformer.ln_f.bias'])
+
     if args.lora_dim > 0:
         rm_model = convert_linear_layer_to_lora(rm_model, args.lora_module_name,
                                                 args.lora_dim)
         if args.only_optimize_lora:
-            rm_model = only_optimize_lora_parameters(
-                rm_model, force_optimize_params=['v_head.weight'])
+            force_optimize_params.append('v_head.weight')
+            rm_model = only_optimize_lora_parameters(rm_model,
+                                                     force_optimize_params)
             rm_model = make_model_gradient_checkpointing_compatible(rm_model)
 
     train_phase = 2
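
For context: the patch relies on only_optimize_lora_parameters accepting a list of parameter names that must stay trainable alongside the LoRA weights, so the re-initialized ln_f.weight/ln_f.bias (and the new v_head.weight) are not frozen. Below is a minimal sketch of that contract, assuming DeepSpeed-Chat's LoRA layers name their factors lora_right_weight/lora_left_weight; it is an illustration of the expected behavior, not the repository's exact implementation.

import torch


def only_optimize_lora_parameters(model: torch.nn.Module,
                                  force_optimize_params=None):
    # Sketch: freeze everything except LoRA factors and any parameter whose
    # fully qualified name appears in force_optimize_params.
    force_optimize_params = force_optimize_params or []
    for name, param in model.named_parameters():
        is_lora = "lora_right_weight" in name or "lora_left_weight" in name
        # Names such as 'rwtransformer.ln_f.weight' or 'v_head.weight' stay
        # trainable even though they are not LoRA weights.
        param.requires_grad = is_lora or name in force_optimize_params
    return model

Keeping the re-initialized LayerNorm trainable matters because its weights were just reset to ones/zeros; if they were frozen along with the rest of the base model, the reset would permanently degrade the pretrained representation instead of letting the finetuning recover it.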