diff --git a/.pylintrc b/.pylintrc index 222bdf6cb..41f7e4e73 100644 --- a/.pylintrc +++ b/.pylintrc @@ -280,8 +280,8 @@ ignored-parents= # Maximum number of arguments for function / method. max-args=5 -# Maximum number of attributes for a class (see R0902). -max-attributes=7 +# Maximum number of attributes for a class (see R0902; raised from 7). +max-attributes=10 # Maximum number of boolean expressions in an if statement (see R0916). max-bool-expr=5 diff --git a/tests/data/test_data_preprocessing_utils.py b/tests/data/test_data_preprocessing_utils.py index c34204f4f..c61d3054a 100644 --- a/tests/data/test_data_preprocessing_utils.py +++ b/tests/data/test_data_preprocessing_utils.py @@ -280,7 +280,7 @@ def test_is_pretokenized_data(data, result): @pytest.mark.parametrize( - "packing, response_template, formatted_train_dataset, max_seq_length, expected_collator", + "packing, response_template, formatted_train_dataset, max_seq_length, instruction_template, expected_collator", [ ( False, @@ -291,6 +291,35 @@ def test_is_pretokenized_data(data, result): split="train", ), 1024, + None, + DataCollatorForCompletionOnlyLM, + ), + ( + False, + None, + Dataset.from_list( + [ + { + "input_ids": [9437, 29, 210], + "attention_mask": [1, 1, 1], + "labels": [1, 20, 30], + } + ] + ), + 1024, + None, + DataCollatorForSeq2Seq, + ), + ( + False, + "\n### Label:", + datasets.load_dataset( + "json", + data_files=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON, + split="train", + ), + 1024, + "\n### Text:", DataCollatorForCompletionOnlyLM, ), ( @@ -306,6 +335,7 @@ def test_is_pretokenized_data(data, result): ] ), 1024, + "\n### Text:", DataCollatorForSeq2Seq, ), ], @@ -315,6 +345,7 @@ def test_get_data_collator( response_template, formatted_train_dataset, max_seq_length, + instruction_template, expected_collator, ): """Ensure that the correct collator type is fetched based on the data args""" @@ -324,6 +355,7 @@ def test_get_data_collator( AutoTokenizer.from_pretrained(MODEL_NAME), 
is_pretokenized_dataset(formatted_train_dataset), max_seq_length, + instruction_template, ) assert isinstance(collator, expected_collator) diff --git a/tuning/config/configs.py b/tuning/config/configs.py index 222bf4424..6786d5410 100644 --- a/tuning/config/configs.py +++ b/tuning/config/configs.py @@ -102,6 +102,21 @@ class DataArguments: Supports both JSON and YAML based config files." }, ) + chat_template: str = field( + default=None, + metadata={ + "help": "chat template to use for tokenization. \ + No need to pass this if the tokenizer already has a chat_template \ + if passed, it will overwrite tokenizer.chat_template if it exists" + }, + ) + instruction_template: str = field( + default=None, + metadata={ + "help": "Should be provided for chat training. \ + Piece of text that determines the start of human response" + }, + ) @dataclass diff --git a/tuning/data/data_preprocessing_utils.py b/tuning/data/data_preprocessing_utils.py index 589e4c9ef..1fa05d40d 100644 --- a/tuning/data/data_preprocessing_utils.py +++ b/tuning/data/data_preprocessing_utils.py @@ -28,6 +28,7 @@ def get_data_collator( tokenizer: AutoTokenizer, is_traindata_tokenized: bool, max_seq_length: int, + instruction_template: Optional[str], ) -> Callable: """Create and return the the appropriate collator type based on the configuration for packing, response_template, and dataset_text_field. @@ -43,12 +44,28 @@ def get_data_collator( Whether train Dataset is tokenized or not max_seq_length: int Max sequence length expected + instruction_template: Optional[str] + Text that marks the start of the human turn in a chat template; may be None Returns: Callable Callable collator to be leveraged by the trainer. 
""" + if response_template and instruction_template: + # response_template_ids = tokenizer.encode( + # response_template, add_special_tokens=False + # )[2:] + # instruction_template_ids = tokenizer.encode( + # instruction_template, add_special_tokens=False + # )[2:] + return DataCollatorForCompletionOnlyLM( + response_template=response_template, + instruction_template=instruction_template, + tokenizer=tokenizer, + ignore_index=configs.IGNORE_INDEX, + ) + if not packing: # TODO: near term - how response template ids are parsed out needs to be cleaned. # The [2:] here applies if response template has \n prefix, it is needed to strip \n, diff --git a/tuning/data/setup_dataprocessor.py b/tuning/data/setup_dataprocessor.py index f9be9a23e..3ba3c9e5f 100644 --- a/tuning/data/setup_dataprocessor.py +++ b/tuning/data/setup_dataprocessor.py @@ -34,8 +34,8 @@ from tuning.data.data_processors import get_datapreprocessor # In future we may make the fields configurable -DEFAULT_JSON_INPUT_KEY = "input" -DEFAULT_JSON_OUTPUT_KEY = "output" +DEFAULT_INPUT_COLUMN = "input" +DEFAULT_OUTPUT_COLUMN = "output" # check if the provided dataset is pretokenized or not # the check is taken from trl @@ -151,12 +151,12 @@ def _get_dataset_formatting_handlers(data_args, packing): return [handler], dataset_text_field -### Data format 3 -def _get_default_json_dataset_handlers(data_args, tokenizer_kwargs): +### Default Data format +def _get_default_dataset_handlers(data_args, tokenizer_kwargs): fn_kwargs = {} - fn_kwargs["input_field_name"] = DEFAULT_JSON_INPUT_KEY - fn_kwargs["output_field_name"] = DEFAULT_JSON_OUTPUT_KEY + fn_kwargs["input_field_name"] = DEFAULT_INPUT_COLUMN + fn_kwargs["output_field_name"] = DEFAULT_OUTPUT_COLUMN fn_kwargs["tokenizer_kwargs"] = tokenizer_kwargs kwargs = { @@ -177,7 +177,9 @@ def _get_default_json_dataset_handlers(data_args, tokenizer_kwargs): # If a text field is specified, append the tokenizer's EOS token to it. 
# If a formatter template is provided, apply it and save the result. # Data remains un-tokenized. -# Data Format 3: JSON Dataset with Input/Output Fields +# Data Format 3: Chat datasets +# User provides response_template and instruction_template. +# Default Data Format: Dataset with Input/Output Fields # Combine input and output fields, tokenize the data, and apply input attention masking. # Requires both input and output fields; throws an error if missing. def _process_raw_data_args( @@ -239,9 +241,13 @@ def _process_raw_data_args( handlers, dataset_text_field = _get_dataset_formatting_handlers( data_args, packing ) + elif data_args.instruction_template and data_args.response_template: + # Data Format 3: Chat dataset with instruction and response template + # We don't do processing for chat dataset + handlers, dataset_text_field = [], None else: - # Data Format 3: JSON Dataset with Input/Output Fields - handlers, dataset_text_field = _get_default_json_dataset_handlers( + # Default Data Format: Dataset with Input/Output Fields + handlers, dataset_text_field = _get_default_dataset_handlers( data_args, tokenizer_kwargs ) @@ -329,6 +335,7 @@ def process_dataargs( tokenizer, is_tokenized_dataset, max_seq_length, + data_args.instruction_template, ) dataset_kwargs = {} diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 2ad55c06b..e64a7bedc 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -287,6 +287,16 @@ def train( multiple_of=model_args.embedding_size_multiple_of, ) + if data_args.chat_template: + logger.info("adding chat_template to the tokenizer") + if tokenizer.chat_template: + logger.warning( + "replacing existing chat_template %s with the given chat_template %s", + tokenizer.chat_template, + data_args.chat_template, + ) + tokenizer.chat_template = data_args.chat_template + # Configure the collator and validate args related to packing prior to formatting the dataset data_collator = None logger.info("Packing is set to %s ", 
train_args.packing)