Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Classification & Regression multilabel #815

Merged
merged 14 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions cfg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Experiment configuration exported by H2O LLM Studio (cfg.yaml).
# NOTE(review): indentation appears flattened in this capture — the original
# YAML nests keys under each section header; verify nesting before reuse.
# Model/backbone settings (int4 quantized backbone, pretrained weights).
architecture:
backbone_dtype: int4
gradient_checkpointing: true
intermediate_dropout: 0.0
pretrained: true
pretrained_weights: ''
# Data augmentation probabilities; all zero, i.e. augmentation disabled.
augmentation:
neftune_noise_alpha: 0.0
random_parent_probability: 0.0
skip_parent_probability: 0.0
token_mask_probability: 0.0
# Dataset layout: answer/prompt columns, sampling and validation split.
# NOTE(review): answer_column "Physics" with num_classes: 2 suggests a
# binary classification target column — confirm against the dataset.
dataset:
add_eos_token_to_answer: false
add_eos_token_to_prompt: false
add_eos_token_to_system: false
add_prompt_answer_tokens: false
answer_column: Physics
chatbot_author: H2O.ai
chatbot_name: h2oGPT
data_sample: 0.02
data_sample_choice:
- Train
- Validation
limit_chained_samples: false
mask_prompt_labels: true
num_classes: 2
only_last_answer: false
parent_id_column: None
personalize: false
prompt_column:
- TITLE
prompt_column_separator: \n\n
system_column: None
text_answer_separator: ''
text_prompt_start: ''
text_system_start: ''
train_dataframe: /home/philipp/h2o-llmstudio/data/user/train.2/train.csv
validation_dataframe: None
validation_size: 0.01
validation_strategy: automatic
# Runtime environment: GPUs 0-2, bfloat16 mixed precision, deepspeed off.
environment:
compile_model: false
deepspeed_allgather_bucket_size: 1000000
deepspeed_method: ZeRO2
deepspeed_reduce_bucket_size: 1000000
deepspeed_stage3_param_persistence_threshold: 1000000
deepspeed_stage3_prefetch_bucket_size: 1000000
find_unused_parameters: false
gpus:
- '0'
- '1'
- '2'
huggingface_branch: main
mixed_precision: true
mixed_precision_dtype: bfloat16
number_of_workers: 8
seed: -1
trust_remote_code: true
use_deepspeed: false
experiment_name: attractive-caracal.1.1.1.1.1.1.1
llm_backbone: h2oai/h2o-danube3-500m-chat
# Logging disabled (logger: None); neptune project left empty.
logging:
logger: None
neptune_project: ''
output_directory: /home/philipp/h2o-llmstudio/output/user/attractive-caracal.1.1.1.1.1.1.1/
# Inference-time settings; problem type pins this config to
# text causal classification modeling.
prediction:
batch_size_inference: 0
metric: Accuracy
problem_type: text_causal_classification_modeling
tokenizer:
add_prompt_answer_tokens: false
max_length: 512
padding_quantile: 1.0
tokenizer_kwargs: '{"use_fast": true, "add_prefix_space": false}'
# Training hyperparameters: LoRA (r=4, alpha=16) on the classification head
# with differential learning rate; CrossEntropyLoss for the 2-class target.
training:
attention_implementation: auto
batch_size: 2
differential_learning_rate: 1.0e-05
differential_learning_rate_layers:
- classification_head
drop_last_batch: true
epochs: 1
evaluate_before_training: true
evaluation_epochs: 1.0
freeze_layers: []
grad_accumulation: 1
gradient_clip: 0.0
learning_rate: 0.0001
lora: true
lora_alpha: 16
lora_dropout: 0.05
lora_r: 4
lora_target_modules: ''
lora_unfreeze_layers: []
loss_function: CrossEntropyLoss
optimizer: AdamW
save_checkpoint: last
schedule: Cosine
train_validation_data: false
use_dora: false
use_rslora: false
warmup_epochs: 0.0
weight_decay: 0.0
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
The column in the dataset containing the expected output.

For classification, this needs to be an integer column starting from zero containing the class label, while for regression, it needs to be a float column.

Multiple target columns can be selected for classification and regression supporting multilabel problems.
In detail, we support the following cases:
- Multi-class classification requires a single column containing the class label
- Binary classification requires a single column containing a binary integer label
- Multilabel classification requires each column to refer to one label encoded with a binary integer label
- For regression, each target column requires a float value
1 change: 0 additions & 1 deletion llm_studio/app_utils/sections/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,6 @@ async def dataset_import(
q.client["dataset/import/azure_conn_string"],
q.client["dataset/import/azure_container"],
)
print(files)

if not files:
ui_filename = ui.textbox(
Expand Down
2 changes: 1 addition & 1 deletion llm_studio/app_utils/sections/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ async def experiment_start(q: Q) -> None:
# Configuration flags:
# from_dataset -- take the values from the dataset config
# from_cfg -- take the values from the configuration file
# from_default -- take the values from the default settings
# from_dataset_args -- take the values from the dataset's q.args
# Otherwise -- take the values from the q.args (user input)

Expand Down
3 changes: 2 additions & 1 deletion llm_studio/python_configs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def check(self) -> Dict[str, List]:
A dictionary with two keys:
- "title": A list of error titles.
- "message": A list of error messages.
- "type": A list of error types, can be "error", "warning", "deprecated"
"""
errors: Dict[str, List] = {"title": [], "message": []}
errors: Dict[str, List] = {"title": [], "message": [], "type": []}
return errors
10 changes: 9 additions & 1 deletion llm_studio/python_configs/cfg_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,18 @@ def check_config_for_errors(cfg: DefaultConfigProblemBase) -> dict:
problem_type_errors = cfg.check()
errors["title"].extend(problem_type_errors["title"])
errors["message"].extend(problem_type_errors["message"])
errors["type"].extend(problem_type_errors["type"])
return errors


def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
errors: Dict[str, List] = {"title": [], "message": []}
errors: Dict[str, List] = {"title": [], "message": [], "type": []}
if not len(cfg.environment.gpus) > 0:
errors["title"] += ["No GPU selected"]
errors["message"] += [
"Please select at least one GPU to start the experiment! "
]
errors["type"].append("error")

if len(cfg.environment.gpus) > torch.cuda.device_count():
errors["title"] += ["More GPUs selected than available"]
Expand All @@ -47,6 +49,7 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
"that was created on a different machine. Please deselect all GPUs and "
"select the GPUs you want to use again. "
]
errors["type"].append("error")

stats = os.statvfs(".")
available_size = stats.f_frsize * stats.f_bavail
Expand All @@ -60,6 +63,7 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
"Please ensure that you have enough disk space before "
"starting the experiment."
]
errors["type"].append("error")

# see create_nlp_backbone
if (
Expand All @@ -71,6 +75,7 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
"Quantization is only supported for pretrained models. "
"Please enable pretrained model or disable quantization."
]
errors["type"].append("error")

if (
not cfg.training.lora
Expand All @@ -83,6 +88,7 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
"likely lead to unstable training. "
"Please use LORA or set Backbone Dtype to bfloat16 or float32."
]
errors["type"].append("warning")

if cfg.environment.use_deepspeed and cfg.architecture.backbone_dtype in [
"int8",
Expand All @@ -94,10 +100,12 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
f"{cfg.architecture.backbone_dtype}. "
"Please set backbone type to float16 or bfloat16 for using deepspeed."
]
errors["type"].append("error")
if cfg.environment.use_deepspeed and len(cfg.environment.gpus) < 2:
errors["title"] += ["Deepspeed not supported for single GPU."]
errors["message"] += [
"Deepspeed does not support single GPU training. "
"Please select more than one GPU or disable deepspeed."
]
errors["type"].append("error")
return errors
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class ConfigNLPCausalClassificationDataset(ConfigNLPCausalLMDataset):
)
system_column: str = "None"
prompt_column: Tuple[str, ...] = ("instruction", "input")
answer_column: str = "label"
answer_column: Tuple[str, ...] = ("label", "output") # type: ignore
num_classes: int = 1
parent_id_column: str = "None"

Expand Down Expand Up @@ -210,26 +210,63 @@ def __post_init__(self):
)

def check(self) -> Dict[str, List]:
    """Validate the causal classification configuration.

    Returns:
        A dictionary with three parallel lists:
        - "title": error titles.
        - "message": error messages.
        - "type": error severities, one of "error", "warning", "deprecated".
    """
    errors: Dict[str, List] = {"title": [], "message": [], "type": []}

    # Backward compatibility: older configs stored answer_column as a
    # plain string. Normalize to a list and flag the usage as deprecated.
    if isinstance(self.dataset.answer_column, str):
        errors["title"].append("Invalid answer_column type")
        errors["message"].append(
            "Providing the answer_column as a string is deprecated. "
            "Please provide the answer_column as a list."
        )
        errors["type"].append("deprecated")
        self.dataset.answer_column = [self.dataset.answer_column]

    if len(self.dataset.answer_column) > 1:
        # Multilabel case: one binary label per answer column.
        if self.training.loss_function == "CrossEntropyLoss":
            errors["title"] += [
                "CrossEntropyLoss not supported for multilabel classification"
            ]
            errors["message"] += [
                "CrossEntropyLoss requires a single multi-class answer column, "
                "but multiple answer columns are set."
            ]
            errors["type"].append("error")
        if self.dataset.num_classes != len(self.dataset.answer_column):
            errors["title"] += [
                "Wrong number of classes for multilabel classification"
            ]
            error_msg = (
                "Multilabel classification requires "
                "num_classes == num_answer_columns, "
                "but num_classes is set to {} and num_answer_columns is set to {}."
            ).format(self.dataset.num_classes, len(self.dataset.answer_column))
            errors["message"] += [error_msg]
            errors["type"].append("error")
    else:
        # Single-column case: multi-class (CE) or binary (BCE) classification.
        if self.training.loss_function == "CrossEntropyLoss":
            if self.dataset.num_classes == 1:
                errors["title"] += ["CrossEntropyLoss requires num_classes > 1"]
                errors["message"] += [
                    "CrossEntropyLoss requires num_classes > 1, "
                    "but num_classes is set to 1."
                ]
                errors["type"].append("error")
        elif self.training.loss_function == "BinaryCrossEntropyLoss":
            if self.dataset.num_classes != 1:
                errors["title"] += [
                    "BinaryCrossEntropyLoss requires num_classes == 1"
                ]
                errors["message"] += [
                    "BinaryCrossEntropyLoss requires num_classes == 1, "
                    "but num_classes is set to {}.".format(self.dataset.num_classes)
                ]
                errors["type"].append("error")

    # Chained (conversational) samples are not meaningful for classification.
    if self.dataset.parent_id_column not in ["None", None]:
        errors["title"] += ["Parent ID column is not supported for classification"]
        errors["message"] += [
            "Parent ID column is not supported for classification datasets."
        ]
        errors["type"].append("error")

    return errors
Original file line number Diff line number Diff line change
Expand Up @@ -642,15 +642,17 @@ def __post_init__(self):
)

def check(self) -> Dict[str, List]:
    """Validate sampling-related settings of the causal LM configuration.

    Returns:
        A dictionary with three parallel lists:
        - "title": error titles.
        - "message": error messages.
        - "type": error severities ("warning" for both checks here).
    """
    errors: Dict[str, List] = {"title": [], "message": [], "type": []}
    # temperature > 0 only takes effect when sampling is enabled.
    if self.prediction.temperature > 0 and not self.prediction.do_sample:
        errors["title"] += ["Do sample needs to be enabled for temperature > 0"]
        errors["message"] += [
            "Please enable do sample if you want to use temperature > 0."
        ]
        errors["type"].append("warning")
    # Conversely, sampling with temperature == 0 is degenerate.
    if self.prediction.temperature == 0 and self.prediction.do_sample:
        errors["title"] += ["Temperature needs to be > 0 for do sample"]
        errors["message"] += [
            "Please increase temperature if you want to use do sample."
        ]
        errors["type"].append("warning")
    return errors
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,22 @@ def __post_init__(self):
)

def check(self) -> Dict[str, List]:
    """Validate the causal regression configuration.

    Returns:
        A dictionary with three parallel lists:
        - "title": error titles.
        - "message": error messages.
        - "type": error severities ("error", "warning", "deprecated").
    """
    errors: Dict[str, List] = {"title": [], "message": [], "type": []}

    # Backward compatibility: normalize a legacy string answer_column to a
    # list and flag the usage as deprecated.
    if isinstance(self.dataset.answer_column, str):
        errors["title"].append("Invalid answer_column type")
        errors["message"].append(
            "Providing the answer_column as a string is deprecated. "
            "Please provide the answer_column as a list."
        )
        errors["type"].append("deprecated")
        self.dataset.answer_column = [self.dataset.answer_column]

    # Chained (conversational) samples are not meaningful for regression.
    if self.dataset.parent_id_column not in ["None", None]:
        errors["title"] += ["Parent ID column is not supported for regression"]
        errors["message"] += [
            "Parent ID column is not supported for regression datasets."
        ]
        errors["type"].append("error")

    return errors
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,11 @@ def __post_init__(self):
)

def check(self) -> Dict[str, List]:
    """Validate sampling-related settings of the configuration.

    Returns:
        A dictionary with three parallel lists:
        - "title": error titles.
        - "message": error messages.
        - "type": error severities ("warning" for the check here).
    """
    errors: Dict[str, List] = {"title": [], "message": [], "type": []}
    # temperature > 0 only takes effect when sampling is enabled.
    if self.prediction.temperature > 0 and not self.prediction.do_sample:
        errors["title"] += ["Do sample needs to be enabled for temperature > 0"]
        errors["message"] += [
            "Please enable do sample if you want to use temperature > 0."
        ]
        errors["type"].append("warning")
    return errors
15 changes: 12 additions & 3 deletions llm_studio/src/datasets/conversation_chain_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,19 @@ def get_conversation_chain_ids(self, cfg, df):

def get_answers(self, df, cfg):
    """Extract answer strings for every sample from the dataframe.

    Supports both a single answer column (legacy string config) and a
    list/tuple of answer columns (multilabel classification / multi-target
    regression), in which case the per-column values are joined with ",".

    Args:
        df: source dataframe holding the answer column(s).
        cfg: experiment config; ``cfg.dataset.answer_column`` is either a
            column name or a list/tuple of column names.

    Returns:
        A list of answer strings, one per sample; empty strings when a
        column is absent from ``df``.
    """
    answer_column = cfg.dataset.answer_column
    if isinstance(answer_column, (list, tuple)):
        answers = []
        for col in answer_column:
            if col in df.columns:
                answers.append(df[col].astype(str).tolist())
            else:
                # Missing column: pad with empty strings so zip stays aligned.
                answers.append(["" for _ in range(len(self.prompts))])
        # Join the per-column values sample-wise: one "v1,v2,..." per row.
        answers = [",".join(ans) for ans in zip(*answers)]
    else:
        if answer_column in df.columns:
            answers = df[answer_column].astype(str).tolist()
        else:
            answers = ["" for _ in range(len(self.prompts))]
    return answers

def get_systems(self, cfg, df):
Expand Down
Loading