[feat.] support encoder-decoder tuning and ChatGLM, Vicuna inference #152

Open · wants to merge 19 commits into base: main
28 changes: 22 additions & 6 deletions examples/finetune.py
@@ -13,7 +13,7 @@
foo = ClassFoo()
bar = foo.FunctionBar()
"""

import os
import sys

from transformers import HfArgumentParser
@@ -52,15 +52,31 @@ def main():
pipeline_args=pipeline_args,
)
dataset = Dataset(data_args)
model = AutoModel.get_model(model_args)

model = AutoModel.get_model(
model_args,
lang=data_args.lang,
forced_bos_token=data_args.forced_bos_token,
source_prefix=data_args.source_prefix,
streaming=data_args.streaming,
preprocessing_num_workers=data_args.preprocessing_num_workers,
overwrite_cache=data_args.overwrite_cache,
max_source_length=data_args.max_source_length,
max_target_length=data_args.max_target_length,
pad_to_max_length=data_args.pad_to_max_length,
)

# Tokenization and text grouping must be done in the main process
with pipeline_args.main_process_first(desc="dataset map tokenization"):
tokenized_dataset = model.tokenize(dataset)
lm_dataset = finetuner.group_text(
tokenized_dataset,
model_max_length=model.get_max_length(),
)
if model_args.arch_type == "encoder_decoder":
# Encoder-decoder models keep each (source, target) pair intact, so text grouping is not needed
lm_dataset = tokenized_dataset
else:
lm_dataset = finetuner.group_text(
tokenized_dataset,
model_max_length=model.get_max_length(),
)

# Finetuning
tuned_model = finetuner.tune(model=model, lm_dataset=lm_dataset)
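Why the encoder-decoder branch skips grouping: causal-LM fine-tuning normally concatenates all tokenized text and re-chunks it into fixed-length blocks, which would destroy the (source, target) pairing that seq2seq training relies on. Below is a minimal sketch of the chunking step being skipped, following the standard Hugging Face recipe; it is an illustration only, not LMFlow's exact group_text implementation.

from itertools import chain

def group_texts(examples, block_size=1024):
    # Concatenate every tokenized field (input_ids, attention_mask, ...) across the batch.
    concatenated = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    # Re-split into fixed-size blocks; example boundaries are lost, which is fine
    # for causal LM but would corrupt paired seq2seq inputs and labels.
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result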
4 changes: 3 additions & 1 deletion requirements.txt
@@ -6,6 +6,8 @@ wandb==0.14.0
deepspeed==0.8.3
trl @ git+https://github.com/lvwerra/trl.git#egg=trl-0.4.1
sentencepiece
icetk==0.0.7
cpm_kernels==1.0.11
transformers @ git+https://github.com/huggingface/transformers@c612628
flask
flask_cors
flask_cors
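The two new pins are presumably ChatGLM-6B runtime dependencies: its tokenizer relies on icetk and its quantized inference kernels use cpm_kernels.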
17 changes: 17 additions & 0 deletions scripts/run_chatbot_seq2seq.sh
@@ -0,0 +1,17 @@
#!/bin/bash

model=THUDM/chatglm-6b
lora_args=""
if [ $# -ge 1 ]; then
model=$1
fi
if [ $# -ge 2 ]; then
lora_args="--lora_model_path $2"
fi

CUDA_VISIBLE_DEVICES=0 \
deepspeed examples/chatbot.py \
--arch_type encoder_decoder \
--deepspeed configs/ds_config_chatbot.json \
--model_name_or_path ${model} \
${lora_args}
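As written, the script takes an optional model name or path as its first positional argument (defaulting to THUDM/chatglm-6b) and an optional LoRA checkpoint path as its second, which is forwarded via --lora_model_path.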
148 changes: 144 additions & 4 deletions src/lmflow/args.py
@@ -13,13 +13,14 @@
"""

from dataclasses import dataclass, field
from typing import Optional
from pathlib import Path
from typing import Optional, Union

from transformers.utils.versions import require_version

from transformers.generation.configuration_utils import GenerationConfig
from transformers import (
MODEL_FOR_CAUSAL_LM_MAPPING,
TrainingArguments,
TrainingArguments
)

MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
@@ -99,6 +100,10 @@ class ModelArguments:
default=None,
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
)
arch_type: Optional[str] = field(
default="decoder_only",
metadata={"help": "The architecture type of the model. Currently supported decoder_only or encoder_decoder"}
)
config_overrides: Optional[str] = field(
default=None,
metadata={
@@ -165,6 +170,15 @@ class ModelArguments:
default=True,
metadata={"help": "Whether use disk mapping when memory is not enough."}
)
resize_position_embeddings: Optional[bool] = field(
default=None,
metadata={
"help": (
"Whether to automatically resize the position embeddings if `max_source_length` exceeds "
"the model's position embeddings."
)
},
)

def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
@@ -225,6 +239,8 @@ class DatasetArguments:
each parameter, such as a help message.
"""

lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})

dataset_path: Optional[str] = field(
default=None, metadata={"help": "The path of the dataset to use."}
)
@@ -309,6 +325,83 @@ class DatasetArguments:
default=None,
metadata={"help": "Evaluation File Path"},
)
max_source_length: Optional[int] = field(
default=1024,
metadata={
"help": (
"The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
)
},
)
max_target_length: Optional[int] = field(
default=128,
metadata={
"help": (
"The maximum total sequence length for target text after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
)
},
)
val_max_target_length: Optional[int] = field(
default=None,
metadata={
"help": (
"The maximum total sequence length for validation target text after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
"This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
"during ``evaluate`` and ``predict``."
)
},
)
pad_to_max_length: bool = field(
default=False,
metadata={
"help": (
"Whether to pad all samples to model maximum sentence length. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
"efficient on GPU but very bad for TPU."
)
},
)
max_predict_samples: Optional[int] = field(
default=None,
metadata={
"help": (
"For debugging purposes or quicker training, truncate the number of prediction examples to this "
"value if set."
)
},
)
num_beams: Optional[int] = field(
default=None,
metadata={
"help": (
"Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
"which is used during ``evaluate`` and ``predict``."
)
},
)
ignore_pad_token_for_loss: bool = field(
default=True,
metadata={
"help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
},
)
source_prefix: Optional[str] = field(
default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
)

forced_bos_token: Optional[str] = field(
default=None,
metadata={
"help": (
"The token to force as the first generated token after the decoder_start_token_id."
"Useful for multilingual models like mBART where the first generated token"
"needs to be the target language token (Usually it is the target language token)"
)
},
)

def __post_init__(self):
if self.streaming:
@@ -330,7 +423,54 @@ class FinetunerArguments(TrainingArguments):
"""
Adapt transformers.TrainingArguments
"""
pass

"""
Args:
sortish_sampler (`bool`, *optional*, defaults to `False`):
Whether to use a *sortish sampler* or not. Only possible if the underlying datasets are *Seq2SeqDataset*
for now but will become generally available in the near future.

It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness
for the training set.
predict_with_generate (`bool`, *optional*, defaults to `False`):
Whether to use generate to calculate generative metrics (ROUGE, BLEU).
generation_max_length (`int`, *optional*):
The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to the
`max_length` value of the model configuration.
generation_num_beams (`int`, *optional*):
The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default to the
`num_beams` value of the model configuration.
"""

sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."})
predict_with_generate: bool = field(
default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
)
generation_max_length: Optional[int] = field(
default=None,
metadata={
"help": (
"The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default "
"to the `max_length` value of the model configuration."
)
},
)
generation_num_beams: Optional[int] = field(
default=None,
metadata={
"help": (
"The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default "
"to the `num_beams` value of the model configuration."
)
},
)
generation_config: Optional[Union[str, Path, GenerationConfig]] = field(
default=None,
metadata={
"help": "Model id, file path or url pointing to a GenerationConfig json file, to use during prediction."
},
)



@dataclass
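The new DatasetArguments fields mirror the seq2seq preprocessing options from Hugging Face's summarization and translation examples. As a hedged illustration of how such fields are typically consumed (the helper below and the examples["input"] / examples["output"] field names are assumptions for this sketch, not LMFlow's actual tokenize code):

def preprocess_function(examples, tokenizer, data_args):
    # Optionally prepend a task prefix (useful for T5-style models).
    inputs = [data_args.source_prefix + text for text in examples["input"]]  # assumed field name
    padding = "max_length" if data_args.pad_to_max_length else False
    model_inputs = tokenizer(
        inputs,
        max_length=data_args.max_source_length,
        padding=padding,
        truncation=True,
    )
    # Tokenize targets; text_target switches the tokenizer into target mode.
    labels = tokenizer(
        text_target=examples["output"],  # assumed field name
        max_length=data_args.max_target_length,
        padding=padding,
        truncation=True,
    )
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        # Replace pad token ids in the labels with -100 so the loss ignores them.
        labels["input_ids"] = [
            [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
            for seq in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs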
6 changes: 3 additions & 3 deletions src/lmflow/datasets/dataset.py
@@ -169,7 +169,7 @@ def from_dict(self, dict_obj: dict, *args, **kwargs):
return self
else:
raise NotImplementedError(
f'Currently .from_dict is not supported for backend "{backend}"'
f'Currently .from_dict is not supported for backend "{self.backend}"'
)


@@ -222,7 +222,7 @@ def to_dict(self):
return dict_obj
else:
raise NotImplementedError(
f'Current .to_dict is not supported for backend "{backend}"'
f'Current .to_dict is not supported for backend "{self.backend}"'
)


@@ -251,7 +251,7 @@ def map(self, *args, **kwargs):
else:
# If the backend is not Hugging Face, raise a NotImplementedError
raise NotImplementedError(
f'Currently .map is not supported for backend "{backend}"'
f'Currently .map is not supported for backend "{self.backend}"'
)


9 changes: 7 additions & 2 deletions src/lmflow/models/auto_model.py
@@ -4,11 +4,16 @@
"""

from lmflow.models.hf_decoder_model import HFDecoderModel

from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel

class AutoModel:

@classmethod
def get_model(self, model_args, *args, **kwargs):
# TODO (add new models)
return HFDecoderModel(model_args, *args, **kwargs)
if model_args.arch_type == "encoder_decoder":
return HFEncoderDecoderModel(model_args, *args, **kwargs)
elif model_args.arch_type == "decoder_only":
return HFDecoderModel(model_args, *args, **kwargs)
else:
raise NotImplementedError(f"Model type \"{model_args.arch_type}\" is not implemented.")
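A quick usage sketch of the new dispatch; the argument values below are illustrative, not defaults from this PR.

from lmflow.args import ModelArguments
from lmflow.models.auto_model import AutoModel

# Decoder-only path: default behaviour, unchanged for existing users.
causal_args = ModelArguments(model_name_or_path="gpt2", arch_type="decoder_only")
causal_model = AutoModel.get_model(causal_args)  # returns HFDecoderModel

# Encoder-decoder path: extra seq2seq kwargs are forwarded to HFEncoderDecoderModel.
seq2seq_args = ModelArguments(model_name_or_path="THUDM/chatglm-6b", arch_type="encoder_decoder")
seq2seq_model = AutoModel.get_model(
    seq2seq_args, max_source_length=1024, max_target_length=128
)  # returns HFEncoderDecoderModel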
22 changes: 22 additions & 0 deletions src/lmflow/models/encoder_decoder_model.py
@@ -0,0 +1,22 @@
#!/usr/bin/env python
# coding=utf-8
"""A one-line summary of the module or program, terminated by a period.

Leave one blank line. The rest of this docstring should contain an
overall description of the module or program. Optionally, it may also
contain a brief description of exported classes and functions and/or usage
examples.

Typical usage example:

foo = ClassFoo()
bar = foo.FunctionBar()
"""

from lmflow.models.base_model import BaseModel


class EncoderDecoderModel(BaseModel):

def __init__(self, *args, **kwargs):
pass
2 changes: 0 additions & 2 deletions src/lmflow/models/hf_decoder_model.py
@@ -49,10 +49,8 @@
from lmflow.models.decoder_model import DecoderModel
from lmflow.models.interfaces.tunable import Tunable


logger = logging.getLogger(__name__)


class HFDecoderModel(DecoderModel, Tunable):
r"""
Initializes a HFDecoderModel instance.