"""
Download, preprocess and serve the TinyStories dataset as a DataLoader.
"""
import argparse
import glob
import json
import os
import random
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import torch
import torch.distributed as dist
from tqdm import tqdm
from tokenizer import Tokenizer
DATA_CACHE_DIR = "data"
TOKENIZER_DIR = "wiki"

def process_shard(args, vocab_size, data_name):
    shard_id, shard = args
    tokenizer_model = get_tokenizer_model_path(vocab_size)
    enc = Tokenizer(tokenizer_model)
    data = []
    with open(shard, "r") as f:
        for line in f:
            d = json.loads(line)
            data.append(d)
    all_tokens = []
    if data_name == "moss":
        for example in tqdm(data, position=shard_id):
            for talk in example["conversation"]:
                query = talk["human"]
                answer = talk["assistant"]
                text = f"[INST] {query} [/INST] {answer}"
                text = text.strip()  # get rid of leading/trailing whitespace
                tokens = enc.encode(text, bos=False, eos=True)  # encode the text, append EOS (no BOS)
                all_tokens.extend(tokens)
    # convert to uint16 nparray
    all_tokens = np.array(all_tokens, dtype=np.uint16)
    # calculate the output filename
    if vocab_size == 0:
        # if we're using Llama 2, just save the tokenized file in the same dir
        tokenized_filename = shard.replace(".jsonl", ".bin")
    else:
        # save .bin files into a new tok{N} directory
        bin_dir = os.path.join(DATA_CACHE_DIR, data_name, f"tok{vocab_size}")
        shard_basename = os.path.basename(shard)
        bin_basename = shard_basename.replace(".jsonl", ".bin")
        tokenized_filename = os.path.join(bin_dir, bin_basename)
    # write the bytes
    with open(tokenized_filename, "wb") as f:
        f.write(all_tokens.tobytes())
    # calculate the average sequence length (sequences are separated by EOS token id 2)
    avg_seq_len = all_tokens.size / ((all_tokens == 2).sum())
    print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")

def pretokenize(vocab_size, data_name):
    # iterate the shards and tokenize all of them one by one
    data_dir = os.path.join(DATA_CACHE_DIR, data_name)
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.jsonl")))
    if vocab_size > 0:
        # .bin files will be saved into tok{N} directory, create it once here
        bin_dir = os.path.join(DATA_CACHE_DIR, data_name, f"tok{vocab_size}")
        os.makedirs(bin_dir, exist_ok=True)
    # process all the shards in a process pool
    fun = partial(process_shard, vocab_size=vocab_size, data_name=data_name)
    with ProcessPoolExecutor() as executor:
        executor.map(fun, enumerate(shard_filenames))
    print("Done.")

class PretokDataset(torch.utils.data.IterableDataset):
    """Loads pretokenized examples from disk and yields them as PyTorch tensors."""

    def __init__(self, split, max_seq_len, vocab_size, vocab_source, tasks):
        super().__init__()
        self.split = split
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.vocab_source = vocab_source
        self.tasks = tasks

    def __iter__(self):
        # get worker info within a DataLoader
        worker_info = torch.utils.data.get_worker_info()
        worker_id = worker_info.id if worker_info else 0
        # get DDP rank info
        rank = dist.get_rank() if dist.is_initialized() else 0
        # combine the worker_id and rank to create a unique seed for rng
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")
        shard_filenames = []
        for data_name in self.tasks:
            if self.vocab_source == "llama2":
                # the .bin files live right alongside the .jsonl files
                bin_dir = os.path.join(DATA_CACHE_DIR, data_name)
                sub_shards = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
            elif self.vocab_source == "custom":
                # the .bin files are in the tok{N} directory
                bin_dir = os.path.join(DATA_CACHE_DIR, data_name, f"tok{self.vocab_size}")
                sub_shards = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
            # train/test split: use only shard 0 for the test split, the rest for train
            sub_shards = sub_shards[1:] if self.split == "train" else sub_shards[:1]
            shard_filenames.extend(sub_shards)
        print(f"Total {len(shard_filenames)} bin files")
        assert len(shard_filenames) > 0, f"No bin files found in {bin_dir}"
        while True:
            shard = rng.choice(shard_filenames)
            # open the dataset for reading but keep it on disk with memmap
            m = np.memmap(shard, dtype=np.uint16, mode="r")
            # windows end every 256 tokens, starting at offset 65536 so that
            # start >= 0 below (assumes max_seq_len + 1 <= 65536)
            num_batches = (len(m) - 65536) // 256
            assert num_batches > 0, "this shard is way too small? investigate."
            ixs = list(range(num_batches))
            rng.shuffle(ixs)
            for ix in ixs:
                end = ix * 256 + 65536
                start = end - self.max_seq_len - 1
                # calling .astype will copy the data into a new numpy array, now in RAM
                chunk = torch.from_numpy((m[start:end]).astype(np.int64))
                x = chunk[:-1]
                y = chunk[1:]
                yield x, y
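
# Worked example of the sampling arithmetic above (illustrative numbers):
# with max_seq_len=1024, ix=0 gives end = 0 * 256 + 65536 = 65536 and
# start = 65536 - 1024 - 1 = 64511, i.e. a 1025-token chunk; x is the first
# 1024 tokens and y is the same window shifted right by one token, forming
# next-token prediction targets. Each successive ix slides the window by 256.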

# -----------------------------------------------------------------------------
# public interface functions

def get_tokenizer_model_path(vocab_size):
    """
    Returns the path to the sentencepiece tokenizer model for a given vocab size.
    vocab_size = 0 designates the default Llama 2 tokenizer; in that case
    None is returned.
    """
    if vocab_size == 0:
        return None
    else:
        return os.path.join(DATA_CACHE_DIR, TOKENIZER_DIR, f"tok{vocab_size}.model")
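
# For example, with the module constants above, get_tokenizer_model_path(2048)
# returns "data/wiki/tok2048.model", while get_tokenizer_model_path(0) returns
# None (use the stock Llama 2 tokenizer).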

class Task:
    @staticmethod
    def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
        ds = PretokDataset(**dataset_kwargs)
        dl = torch.utils.data.DataLoader(
            ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers
        )
        for x, y in dl:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            yield x, y
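
# Minimal usage sketch for Task.iter_batches; all values below are
# illustrative, not defaults baked into this file:
#
#   batch_iter = Task.iter_batches(
#       batch_size=32,
#       device="cuda",
#       num_workers=0,
#       split="train",
#       max_seq_len=1024,
#       vocab_size=2048,
#       vocab_source="custom",
#       tasks=["moss"],
#   )
#   X, Y = next(batch_iter)  # X, Y: int64 tensors of shape (32, 1024)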

# -----------------------------------------------------------------------------
# CLI for constructing the dataset
if __name__ == "__main__":
    """
    To pretokenize data with the Llama 2 tokenizer:
    python data_generator.py pretokenize --data_name=moss
    To pretokenize with a custom sentencepiece tokenizer of vocab size N, e.g.:
    python data_generator.py pretokenize --vocab_size=2048 --data_name=moss
    The custom tokenizer model is expected at data/wiki/tok{N}.model
    (see get_tokenizer_model_path).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("stage", type=str, choices=["pretokenize"])
    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
    parser.add_argument("--data_name", type=str, default="", help="name of the dataset subdirectory under data/ containing .jsonl shards.")
    args = parser.parse_args()
    # depending on the stage call the appropriate function
    if args.stage == "pretokenize":
        pretokenize(vocab_size=args.vocab_size, data_name=args.data_name)
    else:
        raise ValueError(f"Unknown stage {args.stage}")