Skip to content

Commit

Permalink
add alpaca gpt4 dataset (LAION-AI#2610)
Browse files Browse the repository at this point in the history
The inputs can be quite a lot of different versions of `no input`,
therefore don't use the `input` column for that.
In some cases the text in `input` is already in the instruction; in
these cases we also don't use the `input` column.

I am not quite sure how to concatenate the `instruction` and the `input`
column. In most cases it seems fine to just replace last appearance of
`.`, `!` or `?` with a colon, e.g.:
Instruction: `Identify the odd one out.`
Input: `Twitter, Instagram, Telegram`
or 
Instruction: `How dense is a given material?`
Input: `Steel`

But we also have some questions like:
Instruction: `Given the following synopsis, what is the moral lesson of
this story?`
Input: `Once upon a time, there was a poor young boy who wanted some
candy. He begged his father for money to buy it, but his father said no
and ordered him to go to bed. As he was going to bed, the boy saw a
five-dollar bill on the counter, which he took and bought the candy.`

Where this might not be the best case. Either way, I think this one
token will not make a significant difference to the model, and therefore I
just concatenate instruction and input with a space.
  • Loading branch information
CloseChoice authored Apr 20, 2023
1 parent cd71d2a commit b9c60ed
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 0 deletions.
3 changes: 3 additions & 0 deletions model/model_training/custom_datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from model_training.custom_datasets.prompt_dialogue import Gpt4All, load_oig_file
from model_training.custom_datasets.qa_datasets import (
SODA,
AlpacaGpt4,
DatabricksDolly15k,
JokeExplaination,
QADataset,
Expand Down Expand Up @@ -162,6 +163,8 @@ def get_one_dataset(
train, eval = load_hellaswag()
elif dataset_name == "dolly15k":
dataset = DatabricksDolly15k(cache_dir=data_path)
elif dataset_name == "alpaca_gpt4":
dataset = AlpacaGpt4(cache_dir=data_path, **kwargs)
else:
raise ValueError(f"Unknown dataset {dataset_name}")

Expand Down
45 changes: 45 additions & 0 deletions model/model_training/custom_datasets/qa_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import glob
import json
import os
import random
import re
from collections import defaultdict
from pathlib import Path
Expand All @@ -20,6 +21,9 @@
re_reference_remove = re.compile(r"\[\d+(?:,\s*\d+)*?\]")


# Separator candidates used when joining an instruction with its `input`
# column; one is chosen at random per example (see AlpacaGpt4._process_instruction).
LINKING_CHARS = ["\n", "\n\n", " "]


def index_squad_v2(example):
if len(example["answers"]["text"]):
answer = example["answers"]["text"][0]
Expand Down Expand Up @@ -572,3 +576,44 @@ def __getitem__(self, index: int) -> list[str] | tuple[str]:
return dialogue
elif self.mode == "rl":
return tuple(dialogue[:-1])


class AlpacaGpt4(Dataset):
def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: int = 2048) -> None:
super().__init__()
self.rows = []
if mode not in ("sft", "rl"):
raise NotImplementedError(f"Currently only the modes 'sft' and 'rl' are implemented. Received {mode}.")
self.mode = mode
data = load_dataset("vicgalle/alpaca-gpt4", cache_dir=cache_dir)
for line in data["train"]:
if (conv := self._process_instruction(line, input_max_length)) is not None:
self.rows.append(conv)

def _process_instruction(self, row: dict[str, str], input_max_length: int) -> list[str] | None:
# discard items that are too long: when checked on 2023-04-17 this was just one item in the whole dataset with length above 2048.
# And 12 above 1024.
if len(row["input"]) + len(row["instruction"]) > input_max_length:
return None
# filter all appearing variants of "no input" or empty input or cases where the input is already in the instruction.
# In this cases we don't add the input
if (
any([k in row["input"].lower() for k in ["no input", "noinput", "n/a"]])
or (not row["input"])
or (row["input"].lower() in row["instruction"].lower())
):
return [row["instruction"], row["output"]]
# Concatenate the instruction and input.
else:
linking_char = random.choice(LINKING_CHARS)
return [f"{row['instruction']}{linking_char}{row['input']}", row["output"]]

def __len__(self) -> int:
return len(self.rows)

def __getitem__(self, index: int) -> list[str] | tuple[str]:
dialogue: list[str] = self.rows[index]
if self.mode == "sft":
return dialogue
elif self.mode == "rl":
return tuple(dialogue[:-1])

0 comments on commit b9c60ed

Please sign in to comment.