chore(deps): upgrade trl and transformers #448

Merged: 11 commits, Feb 13, 2025
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -29,12 +29,12 @@ classifiers=[
 dependencies = [
     "numpy>=1.26.4,<2.0",
     "accelerate>=0.20.3,!=0.34,<1.1",
-    "transformers>=4.45,<4.46",
+    "transformers>=4.46,<4.48.2",
     "torch>=2.2.0,<2.5",
     "sentencepiece>=0.1.99,<0.3",
     "tokenizers>=0.13.3,<1.0",
     "tqdm>=4.66.2,<5.0",
-    "trl>=0.9.3,<0.12",
+    "trl>=0.13,<0.15",
     "peft>=0.8.0,<0.14",
     "protobuf>=5.28.0,<6.0.0",
     "datasets>=2.15.0,<3.0",
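As a quick sanity check (not part of this PR), the new pins can be asserted at runtime; this sketch assumes the packaging library is installed, which it is in most pip-based environments:

# Hedged sketch: verify installed versions against the new pins.
from importlib.metadata import version
from packaging.specifiers import SpecifierSet

pins = {
    "transformers": SpecifierSet(">=4.46,<4.48.2"),
    "trl": SpecifierSet(">=0.13,<0.15"),
}

for pkg, spec in pins.items():
    installed = version(pkg)
    assert installed in spec, f"{pkg} {installed} violates pin {spec}"
    print(f"{pkg} {installed} satisfies {spec}")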
2 changes: 1 addition & 1 deletion tests/build/test_launch_script.py
@@ -46,7 +46,7 @@
     "num_train_epochs": 5,
     "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 1,
     "learning_rate": 0.00001,
     "weight_decay": 0,
     "warmup_ratio": 0.03,
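The same default is updated in tests/test_sft_trainer.py below. A back-of-envelope calculation (single device and an illustrative dataset size, both assumptions) shows why fewer accumulation steps mean more optimizer steps per epoch, which in turn changes the checkpoint-N directory names a run produces:

# Illustrative numbers only; not taken from the test suite.
import math

num_samples = 10      # hypothetical training-set size
per_device_batch = 4  # matches per_device_train_batch_size above

for accum in (4, 1):  # old vs. new gradient_accumulation_steps
    effective_batch = per_device_batch * accum
    steps = math.ceil(num_samples / effective_batch)
    print(f"accum={accum}: effective batch {effective_batch}, {steps} steps/epoch")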
7 changes: 0 additions & 7 deletions tests/data/test_data_preprocessing.py
@@ -667,13 +667,6 @@ def test_get_data_collator(
         ),
         False,
     ),
-    # Pretokenized data with packing to True
-    (
-        configs.DataArguments(
-            training_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
-        ),
-        True,
-    ),
     ],
 )
 def test_process_data_args_throws_error_where_needed(data_args, packing):
11 changes: 9 additions & 2 deletions tests/test_sft_trainer.py
@@ -22,6 +22,7 @@
 import copy
 import json
 import os
+import re
 import tempfile

 # Third Party
@@ -88,7 +89,7 @@
     num_train_epochs=5,
     per_device_train_batch_size=4,
     per_device_eval_batch_size=4,
-    gradient_accumulation_steps=4,
+    gradient_accumulation_steps=1,
     learning_rate=0.00001,
     weight_decay=0,
     warmup_ratio=0.03,
@@ -1147,7 +1148,13 @@ def _validate_hf_resource_scanner_file(tempdir):


 def _get_checkpoint_path(dir_path):
-    return os.path.join(dir_path, "checkpoint-5")
+    checkpoint_dirs = [
+        d
+        for d in os.listdir(dir_path)
+        if os.path.isdir(os.path.join(dir_path, d)) and re.match(r"^checkpoint-\d+$", d)
+    ]
+    checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1]))
+    return os.path.join(dir_path, checkpoint_dirs[-1])


 def _get_adapter_config(dir_path):
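Because the step count now depends on the accumulation setting, the final checkpoint is no longer reliably checkpoint-5; the rewritten helper picks the highest-numbered checkpoint-N directory instead, sorting numerically rather than lexicographically. A toy demonstration, assuming the _get_checkpoint_path defined above is in scope:

# Toy check (not from the test suite): numeric sort returns checkpoint-100,
# where a lexicographic sort would wrongly prefer checkpoint-20.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    for name in ("checkpoint-20", "checkpoint-100", "runs"):
        os.makedirs(os.path.join(tmp, name))
    assert _get_checkpoint_path(tmp).endswith("checkpoint-100")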
10 changes: 2 additions & 8 deletions tuning/data/setup_dataprocessor.py
@@ -74,7 +74,7 @@ def _process_dataconfig_file(


 # Data Format 1: Pretokenized Data
-def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
+def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):

     # if the provided train dataset is pretokenized
     # however user provides formatting flags, error out
@@ -96,12 +96,6 @@ def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
             along with pretokenized train data"
         )

-    # Support for packing pretokenized datasets has been merged in trl library
-    # see: https://github.com/huggingface/trl/pull/2011
-    # but we wait till a new transformers version is released to remove this check.
-    if packing:
-        raise ValueError("packing will not be used when datasets are pretokenized")
-
     # We do not need a handler here as this is tokenized dataset
     return [], None

@@ -264,7 +258,7 @@ def _process_raw_data_args(
     if is_traindata_tokenized:
         # Data Format 1: Pretokenized Data
         handlers, dataset_text_field = _get_pretokenized_dataset_handlers(
-            data_args, packing, (is_eval_dataset_present and not is_evaldata_tokenized)
+            data_args, (is_eval_dataset_present and not is_evaldata_tokenized)
         )
     elif data_args.instruction_template and data_args.response_template:
         # Data Format 2: Chat dataset with instruction and response template
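With the guard gone, packing of pretokenized data is delegated to trl, which has supported it since huggingface/trl#2011. A hedged sketch of the downstream behavior, not code from this repo — the model id and the tiny inline dataset are placeholders, and it assumes the trl 0.13+ SFTTrainer API:

# Sketch under assumptions: trl>=0.13, a dataset already carrying "input_ids"
# (i.e. pretokenized), and a placeholder model id.
from datasets import Dataset
from trl import SFTConfig, SFTTrainer

train_ds = Dataset.from_dict({"input_ids": [[1, 2, 3], [4, 5, 6, 7]]})

args = SFTConfig(output_dir="/tmp/out", packing=True, max_seq_length=8)
trainer = SFTTrainer(model="facebook/opt-125m", args=args, train_dataset=train_ds)
trainer.train()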