You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
在A800上报错如下:
``
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
main()
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
model.backward(loss)
main() File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
model.backward(loss)
main() File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
main()Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
model.backward(loss)model.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
main()Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
ret_val = func(*args, **kwargs)Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
main()
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
model.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
model.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
main()
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
main()
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
model.backward(loss)ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.backward(loss, retain_graph=retain_graph)
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
model.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
torch.autograd.backward(
scaled_loss.backward(retain_graph=retain_graph) File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
return user_fn(self, *args)ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
scaled_loss.backward(retain_graph=retain_graph) File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
return bwd(*args, **kwargs)torch.autograd.backward(
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
return user_fn(self, *args) File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
if dim > 2:
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
UnboundLocalError: local variable 'dim' referenced before assignment
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward passreturn user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
scaled_loss.backward(retain_graph=retain_graph)return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
if dim > 2:
UnboundLocalError: local variable 'dim' referenced before assignment
return bwd(*args, **kwargs)return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad) File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
torch.autograd.backward(if dim > 2:
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
UnboundLocalError: local variable 'dim' referenced before assignmenttorch.autograd.backward(outputs_with_grad, args_with_grad)
scaled_loss.backward(retain_graph=retain_graph)Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward passreturn user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
torch.autograd.backward(if dim > 2:
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
UnboundLocalErrorreturn bwd(*args, **kwargs)
: local variable 'dim' referenced before assignment File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
if dim > 2:
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
UnboundLocalError: local variable 'dim' referenced before assignment
if dim > 2:
UnboundLocalError: local variable 'dim' referenced before assignment
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
if dim > 2:
UnboundLocalError: local variable 'dim' referenced before assignment
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
if dim > 2:
UnboundLocalError: local variable 'dim' referenced before assignment
``
这是为什么呀,求解答
The text was updated successfully, but these errors were encountered:
运行全参数微调脚本没有问题，但运行 LoRA 微调脚本时会报错。
具体使用的脚本如下，与官方仓库提供的脚本一致：
``
# LoRA fine-tuning launcher: runs process_data.py on data.json, then starts
# main.py under the DeepSpeed launcher. All output of the training command is
# mirrored to $OUTPUT/training.log.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

OUTPUT=telechat-lora-test
ZERO_STAGE=3
MAX_LEN=1024
NUM_SAMPLES=1000
DATA_OUTPUT_PATH=datas/data_files

# Fall back to defaults when the variables above are emptied.
if [ "$OUTPUT" == "" ]; then
    OUTPUT=./output
fi
if [ "$ZERO_STAGE" == "" ]; then
    ZERO_STAGE=3
fi
mkdir -p "$OUTPUT"

# Step 1: data preparation. Each option line must end with a backslash so the
# whole invocation is a single command; without it every "--flag" line would
# be executed as a separate command.
python -u process_data.py \
    --data_path data.json \
    --tokenizer_path ../telechat-12B \
    --data_output_path "$DATA_OUTPUT_PATH" \
    --max_seq_len "$MAX_LEN" \
    --num_samples "$NUM_SAMPLES" \
    --num_workers 10 \
    --process_method multiple \
    --seed 42

# Step 2: training. Reads the data written to $DATA_OUTPUT_PATH in step 1.
# stderr is merged into stdout so tracebacks also end up in training.log.
deepspeed --master_port 29500 main.py \
    --data_path "$DATA_OUTPUT_PATH" \
    --model_name_or_path ../telechat-12B \
    --with_loss_mask \
    --per_device_train_batch_size 1 \
    --max_seq_len "$MAX_LEN" \
    --learning_rate 3e-5 \
    --weight_decay 0.0001 \
    --num_train_epochs 1 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --precision fp16 \
    --warmup_proportion 0.1 \
    --gradient_checkpointing \
    --seed 42 \
    --zero_stage "$ZERO_STAGE" \
    --save_steps 10 \
    --deepspeed \
    --lora_dim 8 \
    --mark_only_lora_as_trainable \
    --lora_module_name "self_attention." \
    --output_dir "$OUTPUT" \
    2>&1 | tee "$OUTPUT/training.log"
``
在A800上报错如下:
``
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
main()
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
model.backward(loss)
main() File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
model.backward(loss)
main() File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
main()Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
model.backward(loss)model.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
main()Traceback (most recent call last):
ret_val = func(*args, **kwargs)Traceback (most recent call last):
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 405, in
main()
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
model.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
model.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
main()
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
main()
File "/data/MLLM/code/Telechat-master/deepspeed-telechat/sft/main.py", line 359, in main
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
model.backward(loss)ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.backward(loss, retain_graph=retain_graph)
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
model.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1976, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
torch.autograd.backward(
scaled_loss.backward(retain_graph=retain_graph) File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
return user_fn(self, *args)ret_val = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2213, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
scaled_loss.backward(retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
scaled_loss.backward(retain_graph=retain_graph) File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
return bwd(*args, **kwargs)torch.autograd.backward(
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
return user_fn(self, *args) File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
if dim > 2:
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
UnboundLocalError: local variable 'dim' referenced before assignment
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward passreturn user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
scaled_loss.backward(retain_graph=retain_graph)return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
if dim > 2:
UnboundLocalError: local variable 'dim' referenced before assignment
return bwd(*args, **kwargs)return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad) File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
torch.autograd.backward(if dim > 2:
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
UnboundLocalError: local variable 'dim' referenced before assignmenttorch.autograd.backward(outputs_with_grad, args_with_grad)
scaled_loss.backward(retain_graph=retain_graph)Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward passreturn user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
torch.autograd.backward(if dim > 2:
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
UnboundLocalErrorreturn bwd(*args, **kwargs)
: local variable 'dim' referenced before assignment File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
if dim > 2:
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
UnboundLocalError: local variable 'dim' referenced before assignment
if dim > 2:
UnboundLocalError: local variable 'dim' referenced before assignment
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/init.py", line 251, in backward
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
if dim > 2:
UnboundLocalError: local variable 'dim' referenced before assignment
return user_fn(self, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 140, in decorate_bwd
return bwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py", line 97, in backward
if dim > 2:
UnboundLocalError: local variable 'dim' referenced before assignment
``
这是为什么呀,求解答
The text was updated successfully, but these errors were encountered: