When fine-tuning Llama with a custom dataset, this error occurs: RuntimeError: chunk expects at least a 1-dimensional tensor
The same code works on a single GPU but fails on multiple GPUs.
What are the possible reasons?
Minimal reproducible example
import torch, multiprocessing
from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed,
    TrainingArguments
)
from trl import SFTTrainer
from peft.utils.other import fsdp_auto_wrap_policy
from accelerate import Accelerator
import os

accelerator = Accelerator()
set_seed(1234)

# Use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
    os.system('pip install flash_attn')
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

# Define model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation=attn_implementation)
# Dataset mapping ------------------
# (dataset preprocessing, LoRA config, and SFTTrainer construction were omitted
#  from the report; `trainer` below refers to the SFTTrainer built in that part)

if hasattr(accelerator.state, 'fsdp_plugin'):
    fsdp_plugin = accelerator.state.fsdp_plugin
    fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model)
else:
    print("FSDP plugin is not available.")

trainer.train()

output_dir = "./fine-tuned-output"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
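For readers who want to run this, a hypothetical reconstruction of the omitted dataset/trainer section is sketched below. Everything in it is an assumption except the values that can be read off the log further down (per-device batch size 1, 16 gradient-accumulation steps, 4 epochs, a text column, and 41,943,040 trainable parameters, which is consistent with rank-16 LoRA on all attention and MLP projections of an 8B Llama); the exact dataset, hyperparameters, and keyword names may differ in the original script and across trl versions.

# Hypothetical reconstruction of the omitted section -- not from the original report.
train_dataset = Dataset.from_list(
    [{"text": "### Question: ...\n### Answer: ..."}] * 489  # placeholder rows; the real data is a custom dataset
)

peft_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,  # illustrative values; r=16 on all projections
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],  # matches 41,943,040 trainable params
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir="./fine-tuned-output",
    per_device_train_batch_size=1,    # matches "Instantaneous batch size per device = 1"
    gradient_accumulation_steps=16,   # matches "Gradient Accumulation steps = 16"
    num_train_epochs=4,               # matches "Num Epochs = 4"
    bf16=(compute_dtype == torch.bfloat16),
    fp16=(compute_dtype == torch.float16),
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    args=training_args,
    tokenizer=tokenizer,  # older trl API; recent releases renamed this to processing_class
)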
Output
Currently training with a batch size of: 2
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`, you can safely ignore this message.
***** Running training *****
Num examples = 489
Num Epochs = 4
Instantaneous batch size per device = 1
Training with DataParallel so batch size has been adjusted to: 2
Total train batch size (w. parallel, distributed & accumulation) = 32
Gradient Accumulation steps = 16
Total optimization steps = 50
Number of trainable parameters = 41,943,040
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[42], line 8
6 else:
7 print("FSDP plugin is not available.")
----> 8 trainer.train()
10 output_dir="./fine-tuned-output"
12 model.save_pretrained(output_dir)
File /opt/conda/lib/python3.11/site-packages/transformers/trainer.py:2123, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2121 hf_hub_utils.enable_progress_bars()
2122 else:
-> 2123 return inner_training_loop(
2124 args=args,
2125 resume_from_checkpoint=resume_from_checkpoint,
2126 trial=trial,
2127 ignore_keys_for_eval=ignore_keys_for_eval,
2128 )
File /opt/conda/lib/python3.11/site-packages/transformers/trainer.py:2481, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2475 context = (
2476 functools.partial(self.accelerator.no_sync, model=model)
2477 if i == len(batch_samples) - 1
2478 else contextlib.nullcontext
2479 )
2480 with context():
-> 2481 tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
2483 if (
2484 args.logging_nan_inf_filter
2485 and not is_torch_xla_available()
2486 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2487 ):
2488 # if loss is nan or inf simply add the average of previous logged losses
2489 tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/conda/lib/python3.11/site-packages/transformers/trainer.py:3579, in Trainer.training_step(self, model, inputs, num_items_in_batch)
3576 return loss_mb.reduce_mean().detach().to(self.args.device)
3578 with self.compute_loss_context_manager():
-> 3579 loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
3581 del inputs
3582 if (
3583 self.args.torch_empty_cache_steps is not None
3584 and self.state.global_step % self.args.torch_empty_cache_steps == 0
3585 ):
File /opt/conda/lib/python3.11/site-packages/transformers/trainer.py:3633, in Trainer.compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
3631 loss_kwargs["num_items_in_batch"] = num_items_in_batch
3632 inputs = {**inputs, **loss_kwargs}
-> 3633 outputs = model(**inputs)
3634 # Save past state if it exists
3635 # TODO: this needs to be fixed and made cleaner later.
3636 if self.args.past_index >= 0:
File /opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/data_parallel.py:175, in DataParallel.forward(self, *inputs, **kwargs)
170 if t.device != self.src_device_obj:
171 raise RuntimeError("module must have its parameters and buffers "
172 f"on device {self.src_device_obj} (device_ids[0]) but found one of "
173 f"them on device: {t.device}")
--> 175 inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids)
176 # for forward function without any inputs, empty list and dict will be created
177 # so the module can be executed on one device which is the first one in device_ids
178 if not inputs and not module_kwargs:
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/data_parallel.py:197, in DataParallel.scatter(self, inputs, kwargs, device_ids)
191 def scatter(
192 self,
193 inputs: Tuple[Any, ...],
194 kwargs: Optional[Dict[str, Any]],
195 device_ids: Sequence[Union[int, torch.device]],
196 ) -> Any:
--> 197 return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/scatter_gather.py:73, in scatter_kwargs(inputs, kwargs, target_gpus, dim)
71 r"""Scatter with support for kwargs dictionary."""
72 scattered_inputs = scatter(inputs, target_gpus, dim) if inputs else []
---> 73 scattered_kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
74 if len(scattered_inputs) < len(scattered_kwargs):
75 scattered_inputs.extend(() for _ in range(len(scattered_kwargs) - len(scattered_inputs)))
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/scatter_gather.py:59, in scatter(inputs, target_gpus, dim)
53 # After scatter_map is called, a scatter_map cell will exist. This cell
54 # has a reference to the actual function scatter_map, which has references
55 # to a closure that has a reference to the scatter_map cell (because the
56 # fn is recursive). To avoid this reference cycle, we set the function to
57 # None, clearing the cell
58 try:
---> 59 res = scatter_map(inputs)
60 finally:
61 scatter_map = None # type: ignore[assignment]
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/scatter_gather.py:50, in scatter.<locals>.scatter_map(obj)
48 return [list(i) for i in zip(*map(scatter_map, obj))]
49 if isinstance(obj, dict) and len(obj) > 0:
---> 50 return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))]
51 return [obj for _ in target_gpus]
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/scatter_gather.py:46, in scatter.<locals>.scatter_map(obj)
44 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
45 if isinstance(obj, tuple) and len(obj) > 0:
---> 46 return list(zip(*map(scatter_map, obj)))
47 if isinstance(obj, list) and len(obj) > 0:
48 return [list(i) for i in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/scatter_gather.py:42, in scatter.<locals>.scatter_map(obj)
40 def scatter_map(obj):
41 if isinstance(obj, torch.Tensor):
---> 42 return Scatter.apply(target_gpus, None, dim, obj)
43 if _is_namedtuple(obj):
44 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.11/site-packages/torch/autograd/function.py:553, in Function.apply(cls, *args, **kwargs)
550 if not torch._C._are_functorch_transforms_active():
551 # See NOTE: [functorch vjp and autograd interaction]
552 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 553 return super().apply(*args, **kwargs) # type: ignore[misc]
555 if not is_setup_ctx_defined:
556 raise RuntimeError(
557 "In order to use an autograd.Function with functorch transforms "
558 "(vmap, grad, jvp, jacrev, ...), it must override the setup_context "
559 "staticmethod. For more details, please see "
560 "https://pytorch.org/docs/master/notes/extending.func.html"
561 )
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/_functions.py:96, in Scatter.forward(ctx, target_gpus, chunk_sizes, dim, input)
93 if torch.cuda.is_available() and ctx.input_device == -1:
94 # Perform CPU to GPU copies in a background stream
95 streams = [_get_stream(torch.device("cuda", device)) for device in target_gpus]
---> 96 outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
97 # Synchronize with the copy stream
98 if streams is not None:
File /opt/conda/lib/python3.11/site-packages/torch/nn/parallel/comm.py:187, in scatter(tensor, devices, chunk_sizes, dim, streams, out)
185 if out is None:
186 devices = [_get_device_index(d) for d in devices]
--> 187 return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
188 else:
189 if devices is not None:
RuntimeError: chunk expects at least a 1-dimensional tensor
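The bottom frames show DataParallel.scatter splitting every tensor kwarg across the GPUs with a chunk along dim 0, which is impossible for a zero-dimensional tensor. The likely offender here is the scalar num_items_in_batch tensor that Trainer.compute_loss merges into the model inputs (trainer.py:3631-3633 above). A minimal standalone sketch of that failure mode, assuming the problematic kwarg is a 0-dim tensor and two GPUs (the variable name scalar_kwarg and the value 32 are illustrative):

import torch

# A 0-dim (scalar) tensor, analogous to the num_items_in_batch value that
# Trainer.compute_loss adds to the model kwargs in the trace above.
scalar_kwarg = torch.tensor(32)

# DataParallel's scatter path ultimately splits each tensor kwarg along dim 0
# (one chunk per GPU); doing the equivalent split by hand on a 0-dim tensor
# raises the same error seen in the traceback.
torch.chunk(scalar_kwarg, chunks=2, dim=0)
# RuntimeError: chunk expects at least a 1-dimensional tensor

On a single GPU there is no scatter step at all, which matches the observation that the same code runs fine there.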
Runtime Environment
llama-3.1 8b
Additional context
Add any other context about the problem or environment here.