Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Classification & Regression multilabel #815

Merged
merged 14 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions cfg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Experiment configuration exported by H2O LLM Studio (cfg.yaml).
# NOTE(review): indentation appears flattened in this capture — the original
# YAML nests keys under each section header; verify nesting before reuse.
# Model/backbone settings (int4 quantized backbone, pretrained weights).
architecture:
backbone_dtype: int4
gradient_checkpointing: true
intermediate_dropout: 0.0
pretrained: true
pretrained_weights: ''
# Data augmentation probabilities; all zero, i.e. augmentation disabled.
augmentation:
neftune_noise_alpha: 0.0
random_parent_probability: 0.0
skip_parent_probability: 0.0
token_mask_probability: 0.0
# Dataset layout: answer/prompt columns, sampling and validation split.
# NOTE(review): answer_column "Physics" with num_classes: 2 suggests a
# binary classification target column — confirm against the dataset.
dataset:
add_eos_token_to_answer: false
add_eos_token_to_prompt: false
add_eos_token_to_system: false
add_prompt_answer_tokens: false
answer_column: Physics
chatbot_author: H2O.ai
chatbot_name: h2oGPT
data_sample: 0.02
data_sample_choice:
- Train
- Validation
limit_chained_samples: false
mask_prompt_labels: true
num_classes: 2
only_last_answer: false
parent_id_column: None
personalize: false
prompt_column:
- TITLE
prompt_column_separator: \n\n
system_column: None
text_answer_separator: ''
text_prompt_start: ''
text_system_start: ''
train_dataframe: /home/philipp/h2o-llmstudio/data/user/train.2/train.csv
validation_dataframe: None
validation_size: 0.01
validation_strategy: automatic
# Runtime environment: GPUs 0-2, bfloat16 mixed precision, deepspeed off.
environment:
compile_model: false
deepspeed_allgather_bucket_size: 1000000
deepspeed_method: ZeRO2
deepspeed_reduce_bucket_size: 1000000
deepspeed_stage3_param_persistence_threshold: 1000000
deepspeed_stage3_prefetch_bucket_size: 1000000
find_unused_parameters: false
gpus:
- '0'
- '1'
- '2'
huggingface_branch: main
mixed_precision: true
mixed_precision_dtype: bfloat16
number_of_workers: 8
seed: -1
trust_remote_code: true
use_deepspeed: false
experiment_name: attractive-caracal.1.1.1.1.1.1.1
llm_backbone: h2oai/h2o-danube3-500m-chat
# Logging disabled (logger: None); neptune project left empty.
logging:
logger: None
neptune_project: ''
output_directory: /home/philipp/h2o-llmstudio/output/user/attractive-caracal.1.1.1.1.1.1.1/
# Inference-time settings; problem type pins this config to
# text causal classification modeling.
prediction:
batch_size_inference: 0
metric: Accuracy
problem_type: text_causal_classification_modeling
tokenizer:
add_prompt_answer_tokens: false
max_length: 512
padding_quantile: 1.0
tokenizer_kwargs: '{"use_fast": true, "add_prefix_space": false}'
# Training hyperparameters: LoRA (r=4, alpha=16) on the classification head
# with differential learning rate; CrossEntropyLoss for the 2-class target.
training:
attention_implementation: auto
batch_size: 2
differential_learning_rate: 1.0e-05
differential_learning_rate_layers:
- classification_head
drop_last_batch: true
epochs: 1
evaluate_before_training: true
evaluation_epochs: 1.0
freeze_layers: []
grad_accumulation: 1
gradient_clip: 0.0
learning_rate: 0.0001
lora: true
lora_alpha: 16
lora_dropout: 0.05
lora_r: 4
lora_target_modules: ''
lora_unfreeze_layers: []
loss_function: CrossEntropyLoss
optimizer: AdamW
save_checkpoint: last
schedule: Cosine
train_validation_data: false
use_dora: false
use_rslora: false
warmup_epochs: 0.0
weight_decay: 0.0
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
The column in the dataset containing the expected output.

For classification, this needs to be an integer column starting from zero containing the class label, while for regression, it needs to be a float column.

Multiple target columns can be selected for classification and regression supporting multilabel problems.
In detail, we support the following cases:
- Multi-class classification requires a single column containing the class label
- Binary classification requires a single column containing a binary integer label
- Multilabel classification requires each column to refer to one label encoded with a binary integer label
- For regression, each target column requires a float value
1 change: 0 additions & 1 deletion llm_studio/app_utils/sections/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,6 @@ async def dataset_import(
q.client["dataset/import/azure_conn_string"],
q.client["dataset/import/azure_container"],
)
print(files)

if not files:
ui_filename = ui.textbox(
Expand Down
2 changes: 1 addition & 1 deletion llm_studio/app_utils/sections/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ async def experiment_start(q: Q) -> None:
# Configuration flags:
# from_dataset -- take the values from the dataset config
# from_cfg -- take the values from the configuration file
# from_default -- take the values from the default settings
# from_dataset_args -- take the values from the dataset's q.args
# Otherwise -- take the values from the q.args (user input)

Expand Down
3 changes: 2 additions & 1 deletion llm_studio/python_configs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def check(self) -> Dict[str, List]:
A dictionary with two keys:
- "title": A list of error titles.
- "message": A list of error messages.
- "type": A list of error types, can be "error", "warning", "deprecated"
"""
errors: Dict[str, List] = {"title": [], "message": []}
errors: Dict[str, List] = {"title": [], "message": [], "type": []}
return errors
10 changes: 9 additions & 1 deletion llm_studio/python_configs/cfg_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,18 @@ def check_config_for_errors(cfg: DefaultConfigProblemBase) -> dict:
problem_type_errors = cfg.check()
errors["title"].extend(problem_type_errors["title"])
errors["message"].extend(problem_type_errors["message"])
errors["type"].extend(problem_type_errors["type"])
return errors


def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
errors: Dict[str, List] = {"title": [], "message": []}
errors: Dict[str, List] = {"title": [], "message": [], "type": []}
if not len(cfg.environment.gpus) > 0:
errors["title"] += ["No GPU selected"]
errors["message"] += [
"Please select at least one GPU to start the experiment! "
]
errors["type"].append("error")

if len(cfg.environment.gpus) > torch.cuda.device_count():
errors["title"] += ["More GPUs selected than available"]
Expand All @@ -47,6 +49,7 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
"that was created on a different machine. Please deselect all GPUs and "
"select the GPUs you want to use again. "
]
errors["type"].append("error")

stats = os.statvfs(".")
available_size = stats.f_frsize * stats.f_bavail
Expand All @@ -60,6 +63,7 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
"Please ensure that you have enough disk space before "
"starting the experiment."
]
errors["type"].append("error")

# see create_nlp_backbone
if (
Expand All @@ -71,6 +75,7 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
"Quantization is only supported for pretrained models. "
"Please enable pretrained model or disable quantization."
]
errors["type"].append("error")

if (
not cfg.training.lora
Expand All @@ -83,6 +88,7 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
"likely lead to unstable training. "
"Please use LORA or set Backbone Dtype to bfloat16 or float32."
]
errors["type"].append("warning")

if cfg.environment.use_deepspeed and cfg.architecture.backbone_dtype in [
"int8",
Expand All @@ -94,10 +100,12 @@ def check_for_common_errors(cfg: DefaultConfigProblemBase) -> dict:
f"{cfg.architecture.backbone_dtype}. "
"Please set backbone type to float16 or bfloat16 for using deepspeed."
]
errors["type"].append("error")
if cfg.environment.use_deepspeed and len(cfg.environment.gpus) < 2:
errors["title"] += ["Deepspeed not supported for single GPU."]
errors["message"] += [
"Deepspeed does not support single GPU training. "
"Please select more than one GPU or disable deepspeed."
]
errors["type"].append("error")
return errors
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class ConfigNLPCausalClassificationDataset(ConfigNLPCausalLMDataset):
)
system_column: str = "None"
prompt_column: Tuple[str, ...] = ("instruction", "input")
answer_column: str = "label"
answer_column: Tuple[str, ...] = ("label", "output") # type: ignore
num_classes: int = 1
parent_id_column: str = "None"

Expand Down Expand Up @@ -210,26 +210,63 @@ def __post_init__(self):
)

def check(self) -> Dict[str, List]:
    """Validate the causal classification configuration.

    Returns:
        A dictionary with three parallel lists:
        - "title": error titles.
        - "message": error messages.
        - "type": error severities, one of "error", "warning", "deprecated".
    """
    errors: Dict[str, List] = {"title": [], "message": [], "type": []}

    # Backward compatibility: older configs stored answer_column as a
    # plain string. Normalize to a list and flag the usage as deprecated.
    if isinstance(self.dataset.answer_column, str):
        errors["title"].append("Invalid answer_column type")
        errors["message"].append(
            "Providing the answer_column as a string is deprecated. "
            "Please provide the answer_column as a list."
        )
        errors["type"].append("deprecated")
        self.dataset.answer_column = [self.dataset.answer_column]

    if len(self.dataset.answer_column) > 1:
        # Multilabel case: one binary label per answer column.
        if self.training.loss_function == "CrossEntropyLoss":
            errors["title"] += [
                "CrossEntropyLoss not supported for multilabel classification"
            ]
            errors["message"] += [
                "CrossEntropyLoss requires a single multi-class answer column, "
                "but multiple answer columns are set."
            ]
            errors["type"].append("error")
        if self.dataset.num_classes != len(self.dataset.answer_column):
            errors["title"] += [
                "Wrong number of classes for multilabel classification"
            ]
            error_msg = (
                "Multilabel classification requires "
                "num_classes == num_answer_columns, "
                "but num_classes is set to {} and num_answer_columns is set to {}."
            ).format(self.dataset.num_classes, len(self.dataset.answer_column))
            errors["message"] += [error_msg]
            errors["type"].append("error")
    else:
        # Single-column case: multi-class (CE) or binary (BCE) classification.
        if self.training.loss_function == "CrossEntropyLoss":
            if self.dataset.num_classes == 1:
                errors["title"] += ["CrossEntropyLoss requires num_classes > 1"]
                errors["message"] += [
                    "CrossEntropyLoss requires num_classes > 1, "
                    "but num_classes is set to 1."
                ]
                errors["type"].append("error")
        elif self.training.loss_function == "BinaryCrossEntropyLoss":
            if self.dataset.num_classes != 1:
                errors["title"] += [
                    "BinaryCrossEntropyLoss requires num_classes == 1"
                ]
                errors["message"] += [
                    "BinaryCrossEntropyLoss requires num_classes == 1, "
                    "but num_classes is set to {}.".format(self.dataset.num_classes)
                ]
                errors["type"].append("error")

    # Chained (conversational) samples are not meaningful for classification.
    if self.dataset.parent_id_column not in ["None", None]:
        errors["title"] += ["Parent ID column is not supported for classification"]
        errors["message"] += [
            "Parent ID column is not supported for classification datasets."
        ]
        errors["type"].append("error")

    return errors
Original file line number Diff line number Diff line change
Expand Up @@ -642,15 +642,17 @@ def __post_init__(self):
)

def check(self) -> Dict[str, List]:
    """Validate sampling-related settings of the causal LM configuration.

    Returns:
        A dictionary with three parallel lists:
        - "title": error titles.
        - "message": error messages.
        - "type": error severities ("warning" for both checks here).
    """
    errors: Dict[str, List] = {"title": [], "message": [], "type": []}
    # temperature > 0 only takes effect when sampling is enabled.
    if self.prediction.temperature > 0 and not self.prediction.do_sample:
        errors["title"] += ["Do sample needs to be enabled for temperature > 0"]
        errors["message"] += [
            "Please enable do sample if you want to use temperature > 0."
        ]
        errors["type"].append("warning")
    # Conversely, sampling with temperature == 0 is degenerate.
    if self.prediction.temperature == 0 and self.prediction.do_sample:
        errors["title"] += ["Temperature needs to be > 0 for do sample"]
        errors["message"] += [
            "Please increase temperature if you want to use do sample."
        ]
        errors["type"].append("warning")
    return errors
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,22 @@ def __post_init__(self):
)

def check(self) -> Dict[str, List]:
    """Validate the causal regression configuration.

    Returns:
        A dictionary with three parallel lists:
        - "title": error titles.
        - "message": error messages.
        - "type": error severities ("error", "warning", "deprecated").
    """
    errors: Dict[str, List] = {"title": [], "message": [], "type": []}

    # Backward compatibility: normalize a legacy string answer_column to a
    # list and flag the usage as deprecated.
    if isinstance(self.dataset.answer_column, str):
        errors["title"].append("Invalid answer_column type")
        errors["message"].append(
            "Providing the answer_column as a string is deprecated. "
            "Please provide the answer_column as a list."
        )
        errors["type"].append("deprecated")
        self.dataset.answer_column = [self.dataset.answer_column]

    # Chained (conversational) samples are not meaningful for regression.
    if self.dataset.parent_id_column not in ["None", None]:
        errors["title"] += ["Parent ID column is not supported for regression"]
        errors["message"] += [
            "Parent ID column is not supported for regression datasets."
        ]
        errors["type"].append("error")

    return errors
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,11 @@ def __post_init__(self):
)

def check(self) -> Dict[str, List]:
    """Validate sampling-related settings of the configuration.

    Returns:
        A dictionary with three parallel lists:
        - "title": error titles.
        - "message": error messages.
        - "type": error severities ("warning" for the check here).
    """
    errors: Dict[str, List] = {"title": [], "message": [], "type": []}
    # temperature > 0 only takes effect when sampling is enabled.
    if self.prediction.temperature > 0 and not self.prediction.do_sample:
        errors["title"] += ["Do sample needs to be enabled for temperature > 0"]
        errors["message"] += [
            "Please enable do sample if you want to use temperature > 0."
        ]
        errors["type"].append("warning")
    return errors
15 changes: 12 additions & 3 deletions llm_studio/src/datasets/conversation_chain_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,19 @@ def get_conversation_chain_ids(self, cfg, df):

def get_answers(self, df, cfg):
    """Extract answer strings for every sample from the dataframe.

    Supports both a single answer column (legacy string config) and a
    list/tuple of answer columns (multilabel classification / multi-target
    regression), in which case the per-column values are joined with ",".

    Args:
        df: source dataframe holding the answer column(s).
        cfg: experiment config; ``cfg.dataset.answer_column`` is either a
            column name or a list/tuple of column names.

    Returns:
        A list of answer strings, one per sample; empty strings when a
        column is absent from ``df``.
    """
    answer_column = cfg.dataset.answer_column
    if isinstance(answer_column, (list, tuple)):
        answers = []
        for col in answer_column:
            if col in df.columns:
                answers.append(df[col].astype(str).tolist())
            else:
                # Missing column: pad with empty strings so zip stays aligned.
                answers.append(["" for _ in range(len(self.prompts))])
        # Join the per-column values sample-wise: one "v1,v2,..." per row.
        answers = [",".join(ans) for ans in zip(*answers)]
    else:
        if answer_column in df.columns:
            answers = df[answer_column].astype(str).tolist()
        else:
            answers = ["" for _ in range(len(self.prompts))]
    return answers

def get_systems(self, cfg, df):
Expand Down
Loading