You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I'm trying to use Dataset.from_generator() to generate a large dataset.
Steps to reproduce the bug
Code to reproduce:
from transformers import T5Tokenizer, T5ForConditionalGeneration, GenerationConfig
import torch
from tqdm import tqdm
from datasets import load_dataset
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small", device_map="auto")
model = torch.compile(model)
def generate_data(data_loader):
model.eval()
for batch in tqdm(data_loader):
input_ids = tokenizer(batch['instruction'], return_tensors='pt', padding=True, truncation=True).input_ids.to("cuda:0")
with torch.no_grad():
outputs = model.generate(input_ids, generation_config=generation_config)
decoder_hidden_states = outputs.decoder_hidden_states
for i, h in zip(batch['instruction'], decoder_hidden_states):
yield {"instruction": i, "decoder_hidden_states": h}
generation_config = GenerationConfig(
temperature=1,
max_new_tokens=1024,
do_sample=False,
num_return_sequences=1,
return_dict_in_generate=True,
output_scores=True,
output_hidden_states=True,
)
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
dataset = load_dataset("HuggingFaceH4/databricks_dolly_15k")
train_loader = DataLoader(dataset['train'], batch_size=2, shuffle=True)
dataset = Dataset.from_generator(generator=generate_data, gen_kwargs={"data_loader": train_loader})
dataset.save_to_disk("data/flant5_small_generation")
Expected behavior
The dataset should be generated and saved.
But the following error occurred:
Traceback (most recent call last):
File "/remote-home/xhwang/alpaca-lora/data_collection_t5.py", line 46, in <module>
dataset = Dataset.from_generator(generator=generate_data, gen_kwargs={"data_loader": train_loader})
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 1035, in from_generator
return GeneratorDatasetInputStream(
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/io/generator.py", line 28, in __init__
self.builder = Generator(
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/builder.py", line 336, in __init__
self.config, self.config_id = self._create_builder_config(
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/builder.py", line 505, in _create_builder_config
config_id = builder_config.create_config_id(
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/builder.py", line 179, in create_config_id
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/fingerprint.py", line 236, in hash
return cls.hash_default(value)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/fingerprint.py", line 229, in hash_default
return cls.hash_bytes(dumps(value))
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 726, in dumps
dump(obj, file)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 701, in dump
Pickler(file, recurse=True).dump(obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 394, in dump
StockPickler.dump(self, obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 487, in dump
self.save(obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1186, in save_module_dict
StockPickler.save_dict(pickler, obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 972, in save_dict
self._batch_setitems(obj.items())
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 998, in _batch_setitems
save(v)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 1311, in save_function
dill._dill._save_with_postproc(
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1084, in _save_with_postproc
pickler._batch_setitems(iter(source.items()))
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 998, in _batch_setitems
save(v)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 603, in save
self.save_reduce(obj=obj, *rv)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 717, in save_reduce
save(state)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1186, in save_module_dict
StockPickler.save_dict(pickler, obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 972, in save_dict
self._batch_setitems(obj.items())
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 998, in _batch_setitems
save(v)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 603, in save
self.save_reduce(obj=obj, *rv)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 717, in save_reduce
save(state)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1186, in save_module_dict
StockPickler.save_dict(pickler, obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 972, in save_dict
self._batch_setitems(obj.items())
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 998, in _batch_setitems
save(v)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 1311, in save_function
dill._dill._save_with_postproc(
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1070, in _save_with_postproc
pickler.save_reduce(*reduction, obj=obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 717, in save_reduce
save(state)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 887, in save_tuple
save(element)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1186, in save_module_dict
StockPickler.save_dict(pickler, obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 972, in save_dict
self._batch_setitems(obj.items())
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 998, in _batch_setitems
save(v)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 1311, in save_function
dill._dill._save_with_postproc(
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1070, in _save_with_postproc
pickler.save_reduce(*reduction, obj=obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 717, in save_reduce
save(state)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 887, in save_tuple
save(element)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1186, in save_module_dict
StockPickler.save_dict(pickler, obj)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 972, in save_dict
self._batch_setitems(obj.items())
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 1003, in _batch_setitems
save(v)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 1311, in save_function
dill._dill._save_with_postproc(
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 1084, in _save_with_postproc
pickler._batch_setitems(iter(source.items()))
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 998, in _batch_setitems
save(v)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 691, in save
dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/site-packages/dill/_dill.py", line 388, in save
StockPickler.save(self, obj, save_persistent_id)
File "/remote-home/xhwang/anaconda3/envs/alpaca-lora/lib/python3.10/pickle.py", line 578, in save
rv = reduce(self.proto)
TypeError: cannot pickle 'ConfigModuleInstance' object
Hi! It should work if you put model = torch.compile(model) inside the generate_data function. If a referenced object is outside, it needs to be pickable, and that's not the case for the compiled models (or functions).
Hi! It should work if you put model = torch.compile(model) inside the generate_data function. If a referenced object is outside, it needs to be pickable, and that's not the case for the compiled models (or functions).
Hi! Thank you for your reply! Everything works perfectly with your suggestion!
Describe the bug
I'm trying to use Dataset.from_generator() to generate a large dataset.
Steps to reproduce the bug
Code to reproduce:
Expected behavior
The dataset should be generated and saved.
But the following error occurred:
Environment info
datasets
version: 2.11.0The text was updated successfully, but these errors were encountered: