
tried to train #1

Closed
thistleknot opened this issue Jun 8, 2024 · 9 comments

Comments

@thistleknot

thistleknot commented Jun 8, 2024

triton 2.2.0
torch 2.2.0
einops 0.7.0
compute capability 6.0
rocky linux 9
cuda 12.2
python 3.10

setup


import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Disable parallelism for tokenizers to avoid potential issues
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Specify the model name (replace '' with your actual model name)
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).cuda().half()

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

# Initialize Trainer with model, tokenizer, and training arguments
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments
    tokenizer=tokenizer,                 # tokenizer
    train_dataset=your_train_dataset,    # replace with your training dataset
    eval_dataset=your_eval_dataset       # replace with your evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

print(results)

# Generate text using the model
input_prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.cuda()
outputs = model.generate(input_ids, max_length=32, do_sample=True, top_p=0.4, temperature=0.6)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


error

---------------------------------------------------------------------------
HFValidationError                         Traceback (most recent call last)
File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/utils/hub.py:398, in cached_file(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)
    396 try:
    397     # Load from URL or cache if already cached
--> 398     resolved_file = hf_hub_download(
    399         path_or_repo_id,
    400         filename,
    401         subfolder=None if len(subfolder) == 0 else subfolder,
    402         repo_type=repo_type,
    403         revision=revision,
    404         cache_dir=cache_dir,
    405         user_agent=user_agent,
    406         force_download=force_download,
    407         proxies=proxies,
    408         resume_download=resume_download,
    409         token=token,
    410         local_files_only=local_files_only,
    411     )
    412 except GatedRepoError as e:

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:110, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
    109 if arg_name in ["repo_id", "from_id", "to_id"]:
--> 110     validate_repo_id(arg_value)
    112 elif arg_name == "token" and arg_value is not None:

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:164, in validate_repo_id(repo_id)
    163 if not REPO_ID_REGEX.match(repo_id):
--> 164     raise HFValidationError(
    165         "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are"
    166         " forbidden, '-' and '.' cannot start or end the name, max length is 96:"
    167         f" '{repo_id}'."
    168     )
    170 if "--" in repo_id or ".." in repo_id:

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: ''.

The above exception was the direct cause of the following exception:

OSError                                   Traceback (most recent call last)
Cell In[37], line 1
----> 1 trainer = SFTTrainer(
      2     model,
      3     train_dataset=dataset_dict['train'],
      4     eval_dataset=dataset_dict['validation'],
      5     args=training_args,
      6 )
      8 trainer.train()

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:238, in __init__(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics, peft_config, dataset_text_field, packing, formatting_func, max_seq_length, infinite, num_of_sequences, chars_per_token, dataset_num_proc, dataset_batch_size, neftune_noise_alpha, model_init_kwargs, dataset_kwargs, eval_packing)
    232         args = dataclasses.replace(args, gradient_checkpointing=False)
    233 elif getattr(args, "gradient_checkpointing", False) and (
    234     "use_reentrant" not in gradient_checkpointing_kwargs
    235     or gradient_checkpointing_kwargs["use_reentrant"]
    236 ):
    237     # For backward compatibility with older versions of transformers
--> 238     if hasattr(model, "enable_input_require_grads"):
    239         model.enable_input_require_grads()
    240     else:

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:804, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    801     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    803 # Next, let's try to use the tokenizer_config file to get the tokenizer class.
--> 804 tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
    805 if "_commit_hash" in tokenizer_config:
    806     kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:637, in get_tokenizer_config(pretrained_model_name_or_path, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, **kwargs)
    634     token = use_auth_token
    636 commit_hash = kwargs.get("_commit_hash", None)
--> 637 resolved_config_file = cached_file(
    638     pretrained_model_name_or_path,
    639     TOKENIZER_CONFIG_FILE,
    640     cache_dir=cache_dir,
    641     force_download=force_download,
    642     resume_download=resume_download,
    643     proxies=proxies,
    644     token=token,
    645     revision=revision,
    646     local_files_only=local_files_only,
    647     subfolder=subfolder,
    648     _raise_exceptions_for_gated_repo=False,
    649     _raise_exceptions_for_missing_entries=False,
    650     _raise_exceptions_for_connection_errors=False,
    651     _commit_hash=commit_hash,
    652 )
    653 if resolved_config_file is None:
    654     logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/utils/hub.py:462, in cached_file(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)
    460     raise EnvironmentError(f"There was a specific connection error when trying to load {path_or_repo_id}:\n{err}")
    461 except HFValidationError as e:
--> 462     raise EnvironmentError(
    463         f"Incorrect path_or_model_id: '{path_or_repo_id}'. Please provide either the path to a local folder or the repo_id of a model on the Hub."
    464     ) from e
    465 return resolved_file

OSError: Incorrect path_or_model_id: ''. Please provide either the path to a local folder or the repo_id of a model on the Hub.

@ridgerchu
Owner

Hi, it seems you are using TinyLlama for training instead of our model...

# Specify the model name (replace '' with your actual model name)
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
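
For reference, a minimal sketch of loading our released checkpoint instead (this assumes that importing mmfreelm registers the model classes with the transformers Auto factories; otherwise load HGRNBitForCausalLM directly):

import mmfreelm  # importing this should register the MatMul-free model classes
from transformers import AutoModelForCausalLM, AutoTokenizer

name = 'ridger/MMfreeLM-2.7B'
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name).cuda().half()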

@thistleknot
Author

thistleknot commented Jun 8, 2024 via email

@thistleknot
Author

I re-reviewed the README and saw a models link:
https://huggingface.co/ridger/MMfreeLM-2.7B
Will try those.

@thistleknot
Author

Revised code

from mmfreelm.models import HGRNBitConfig
from transformers import AutoModel
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from trl import SFTTrainer
import os
import mmfreelm
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split


os.environ["WANDB_MODE"]="offline"


model_name_or_path = "ridger/MMfreeLM-2.7B"

#config = 'ridger/MMfreeLM-2.7B'#HGRNBitConfig()
model = AutoModel.from_pretrained(model_name_or_path)

batch_size=6

training_args = TrainingArguments(
    #eval_strategy='steps',
    output_dir='./results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    eval_steps=1,
    gradient_accumulation_steps=1,
    optim='paged_lion_8bit',
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #0.000003, starts at 6
    #0.00003, #start at 2.6
    learning_rate=0.001,
    warmup_ratio=.1,
    adam_beta2=0.95,
    adam_epsilon=0.00001,
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
    logging_dir='./logs',
    logging_steps=1,
    do_train=True
)



tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path, model_max_length=2048,
    padding_side="right", use_fast=False, add_eos_token=True, add_bos_token=True)

tokenizer.pad_token = tokenizer.eos_token


# Load the dataset
dataset = load_dataset("Abirate/english_quotes")

# Filter the quotes based on length
filtered_quotes = [q for q in dataset['train'] if 23 < len(q['quote']) < 140]

# Create a new dataset with the filtered quotes
filtered_dataset = Dataset.from_dict({'quote': [q['quote'] for q in filtered_quotes]})
# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples['quote'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = filtered_dataset.map(tokenize_function, batched=True)

# Split the dataset into training and validation
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict(train=train_test_split['train'], validation=train_test_split['test'])






# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1,
    gradient_accumulation_steps=1,
    learning_rate=0.001,
    warmup_ratio=0.1,
    logging_dir='./logs',
    logging_steps=1
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    tokenizer=tokenizer
)

# Train the model
trainer.train()

but the kernel bombs out.
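
One way I could surface the underlying error, rather than a silent kernel death, would be to run the same code as a plain script with synchronous CUDA launches; a rough sketch, with the script name purely illustrative:

# debug_run.py -- illustrative wrapper around the training code above
import os

# make CUDA errors raise at the offending call instead of being reported asynchronously
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# keep tokenizer fork warnings out of the output
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ...paste the model / tokenizer / Trainer code from above here, then run:
#   python debug_run.py 2>&1 | tee train.log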

@ridgerchu
Owner

Hi, does it work for inference instead of training? And do you use an NVIDIA GPU for training?

@thistleknot
Author

That's a great troubleshooting step.

The first error I made was using AutoModel; I changed it to the custom class definition (HGRNBitForCausalLM)

and then tried inference:

from mmfreelm.models import HGRNBitConfig, HGRNBitForCausalLM
from transformers import AutoModel
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from trl import SFTTrainer
import os
import mmfreelm
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split


os.environ["WANDB_MODE"]="offline"


model_name_or_path = "ridger/MMfreeLM-2.7B"

#config = 'ridger/MMfreeLM-2.7B'#HGRNBitConfig()
model = HGRNBitForCausalLM.from_pretrained(model_name_or_path)
model.to('cuda')
batch_size=6

training_args = TrainingArguments(
    #eval_strategy='steps',
    output_dir='./results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    eval_steps=1,
    gradient_accumulation_steps=1,
    optim='paged_lion_8bit',
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #0.000003, starts at 6
    #0.00003, #start at 2.6
    learning_rate=0.001,
    warmup_ratio=.1,
    adam_beta2=0.95,
    adam_epsilon=0.00001,
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
    logging_dir='./logs',
    logging_steps=1,
    do_train=True
)



tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path, model_max_length=2048,
    padding_side="right", use_fast=False, add_eos_token=True, add_bos_token=True)

tokenizer.pad_token = tokenizer.eos_token



# Generate text using the model
input_prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.cuda()
outputs = model.generate(input_ids, max_length=32, do_sample=True, top_p=0.4, temperature=0.6)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

This results in the kernel quitting (no console output).

NVIDIA, yes (CUDA 12.2, Python 3.10).

@ridgerchu
Owner

Hmm, have you installed triton==2.2?
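
A quick way to check what the running environment actually has (just a sketch):

import triton
import torch

# both should print 2.2.0 for the setup described in the first post
print("triton:", triton.__version__)
print("torch:", torch.__version__)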

@thistleknot
Author

Yes, I updated the first post with my env.

@ridgerchu
Owner

Thanks! I see. I checked your env and found that your compute capability is 6.0, indicating you are using the Pascal architecture. Pascal may not be well supported by Triton, which I guess may lead to this problem... We tested our code on A100 and H100, where it works well...
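
You can confirm the architecture from Python with something like:

import torch

# (major, minor) compute capability of GPU 0:
# Pascal reports 6.x, while A100 reports (8, 0) and H100 reports (9, 0)
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_capability(0))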
