Skip to content

Commit

Permalink
add alpaca gpt4 dataset (LAION-AI#2610)
Browse files Browse the repository at this point in the history
The inputs can be quite a lot of different versions of `no input`,
therefore don't use the `input` column for that.
In some cases the text in `input` is already in the instruction; in
these cases we also don't use the `input` column.

I am not quite sure how to concatenate the `instruction` and the `input`
column. In most cases it seems fine to just replace last appearance of
`.`, `!` or `?` with a colon, e.g.:
Instruction: `Identify the odd one out.`
Input: `Twitter, Instagram, Telegram`
or 
Instruction: `How dense is a given material?`
Input: `Steel`

But we also have some questions like:
Instruction: `Given the following synopsis, what is the moral lesson of
this story?`
Input: `Once upon a time, there was a poor young boy who wanted some
candy. He begged his father for money to buy it, but his father said no
and ordered him to go to bed. As he was going to bed, the boy saw a
five-dollar bill on the counter, which he took and bought the candy.`

Where this might not be the best case. Either way, I think this one
token will not make a significant difference to the model, and therefore I
just concatenate instruction and input with a space.
  • Loading branch information
CloseChoice authored Apr 20, 2023
1 parent cd71d2a commit b9c60ed
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 0 deletions.
3 changes: 3 additions & 0 deletions model/model_training/custom_datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from model_training.custom_datasets.prompt_dialogue import Gpt4All, load_oig_file
from model_training.custom_datasets.qa_datasets import (
SODA,
AlpacaGpt4,
DatabricksDolly15k,
JokeExplaination,
QADataset,
Expand Down Expand Up @@ -162,6 +163,8 @@ def get_one_dataset(
train, eval = load_hellaswag()
elif dataset_name == "dolly15k":
dataset = DatabricksDolly15k(cache_dir=data_path)
elif dataset_name == "alpaca_gpt4":
dataset = AlpacaGpt4(cache_dir=data_path, **kwargs)
else:
raise ValueError(f"Unknown dataset {dataset_name}")

Expand Down
45 changes: 45 additions & 0 deletions model/model_training/custom_datasets/qa_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import glob
import json
import os
import random
import re
from collections import defaultdict
from pathlib import Path
Expand All @@ -20,6 +21,9 @@
re_reference_remove = re.compile(r"\[\d+(?:,\s*\d+)*?\]")


# Separator candidates used when joining an instruction with its `input`
# column; one is chosen at random per example (see AlpacaGpt4._process_instruction).
LINKING_CHARS = ["\n", "\n\n", " "]


def index_squad_v2(example):
if len(example["answers"]["text"]):
answer = example["answers"]["text"][0]
Expand Down Expand Up @@ -572,3 +576,44 @@ def __getitem__(self, index: int) -> list[str] | tuple[str]:
return dialogue
elif self.mode == "rl":
return tuple(dialogue[:-1])


class AlpacaGpt4(Dataset):
def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: int = 2048) -> None:
super().__init__()
self.rows = []
if mode not in ("sft", "rl"):
raise NotImplementedError(f"Currently only the modes 'sft' and 'rl' are implemented. Received {mode}.")
self.mode = mode
data = load_dataset("vicgalle/alpaca-gpt4", cache_dir=cache_dir)
for line in data["train"]:
if (conv := self._process_instruction(line, input_max_length)) is not None:
self.rows.append(conv)

def _process_instruction(self, row: dict[str, str], input_max_length: int) -> list[str] | None:
# discard items that are too long: when checked on 2023-04-17 this was just one item in the whole dataset with length above 2048.
# And 12 above 1024.
if len(row["input"]) + len(row["instruction"]) > input_max_length:
return None
# filter all appearing variants of "no input" or empty input or cases where the input is already in the instruction.
# In this cases we don't add the input
if (
any([k in row["input"].lower() for k in ["no input", "noinput", "n/a"]])
or (not row["input"])
or (row["input"].lower() in row["instruction"].lower())
):
return [row["instruction"], row["output"]]
# Concatenate the instruction and input.
else:
linking_char = random.choice(LINKING_CHARS)
return [f"{row['instruction']}{linking_char}{row['input']}", row["output"]]

def __len__(self) -> int:
return len(self.rows)

def __getitem__(self, index: int) -> list[str] | tuple[str]:
dialogue: list[str] = self.rows[index]
if self.mode == "sft":
return dialogue
elif self.mode == "rl":
return tuple(dialogue[:-1])

0 comments on commit b9c60ed

Please sign in to comment.