From 316aa1c840e0c70c475d60b7c5371d7850b94336 Mon Sep 17 00:00:00 2001
From: Cheng Guo
Date: Tue, 25 Apr 2023 09:55:35 +0200
Subject: [PATCH] Add two popular datasets for character-level LM

---
 config/train_enwik8.py | 38 +++++++++++++++++++++
 config/train_text8.py  | 38 +++++++++++++++++++++
 data/enwik8/prepare.py | 75 +++++++++++++++++++++++++++++++++++++++++
 data/text8/prepare.py  | 76 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 227 insertions(+)
 create mode 100644 config/train_enwik8.py
 create mode 100644 config/train_text8.py
 create mode 100644 data/enwik8/prepare.py
 create mode 100644 data/text8/prepare.py

diff --git a/config/train_enwik8.py b/config/train_enwik8.py
new file mode 100644
index 0000000000..07f0120408
--- /dev/null
+++ b/config/train_enwik8.py
@@ -0,0 +1,38 @@
+# train a character-level model on enwik8
+
+out_dir = "out-enwik8"
+eval_interval = 1000
+eval_iters = 200
+log_interval = 100 # don't print too often
+
+# only save when val improves
+always_save_checkpoint = False
+
+# wandb_log = True # override via command line if you like
+# wandb_project = 'nanogpt'
+# wandb_run_name = 'enwik8'
+
+dataset = "enwik8"
+gradient_accumulation_steps = 1
+batch_size = 32
+block_size = 256 # context of up to 256 previous characters
+
+# baby GPT model :)
+n_layer = 6
+n_head = 6
+n_embd = 512
+dropout = 0.2
+
+learning_rate = 5e-4
+max_iters = 100000
+lr_decay_iters = max_iters # make equal to max_iters usually
+min_lr = 5e-5 # learning_rate / 10 usually
+beta2 = 0.99
+
+warmup_iters = 200 # not super necessary potentially
+
+# on macbook also add
+# device = 'cpu' # run on cpu only
+compile = True # use PyTorch 2.0 to compile the model to be faster
+# init_from = 'resume'
+# eval_only = True
diff --git a/config/train_text8.py b/config/train_text8.py
new file mode 100644
index 0000000000..82e0525cef
--- /dev/null
+++ b/config/train_text8.py
@@ -0,0 +1,38 @@
+# train a character-level model on text8
+
+out_dir = "out-text8"
+eval_interval = 1000
+eval_iters = 200
+log_interval = 100 # don't print too often
+
+# only save when val improves
+always_save_checkpoint = False
+
+# wandb_log = True # override via command line if you like
+# wandb_project = 'nanogpt'
+# wandb_run_name = 'text8'
+
+dataset = "text8"
+gradient_accumulation_steps = 1
+batch_size = 32
+block_size = 256 # context of up to 256 previous characters
+
+# baby GPT model :)
+n_layer = 6
+n_head = 6
+n_embd = 512
+dropout = 0.2
+
+learning_rate = 5e-4
+max_iters = 100000
+lr_decay_iters = max_iters # make equal to max_iters usually
+min_lr = 5e-5 # learning_rate / 10 usually
+beta2 = 0.99
+
+warmup_iters = 200 # not super necessary potentially
+
+# on macbook also add
+# device = 'cpu' # run on cpu only
+compile = True # use PyTorch 2.0 to compile the model to be faster
+# init_from = 'resume'
+# eval_only = True
diff --git a/data/enwik8/prepare.py b/data/enwik8/prepare.py
new file mode 100644
index 0000000000..e82f3d3416
--- /dev/null
+++ b/data/enwik8/prepare.py
@@ -0,0 +1,75 @@
+"""
+Prepare the enwik8 dataset for character-level language modeling.
+So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
+Will save train.bin, val.bin, test.bin containing the ids, and meta.pkl containing the
+encoder and decoder and some other related info.
+""" +import os +import pickle +import requests +import numpy as np + +# download the enwik8 dataset +input_file_path = os.path.join(os.path.dirname(__file__), 'enwik8') +if not os.path.exists(input_file_path): + data_url = 'http://mattmahoney.net/dc/enwik8.zip' + r = requests.get(data_url) + with open(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'wb') as f: + f.write(r.content) + + # unzip the enwik8 dataset + import zipfile + with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'r') as zip_ref: + zip_ref.extractall(os.path.dirname(__file__)) + +with open(input_file_path, 'r', encoding='latin-1') as f: + data = f.read() +print(f"length of dataset in characters: {len(data):,}") + +# get all the unique characters that occur in this text +chars = sorted(list(set(data))) +vocab_size = len(chars) +print("all the unique characters:", ''.join(chars)) +print(f"vocab size: {vocab_size:,}") + +# create a mapping from characters to integers +stoi = { ch:i for i,ch in enumerate(chars) } +itos = { i:ch for i,ch in enumerate(chars) } +def encode(s): + return [stoi[c] for c in s] # encoder: take a string, output a list of integers +def decode(l): + return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string + +# create the train, validation, and test splits +n = len(data) +num_test_chars = 5000000 +train_data = data[: -2 * num_test_chars] +val_data = data[-2 * num_test_chars: -num_test_chars] +test_data = data[-num_test_chars:] + +# encode all splits to integers +train_ids = encode(train_data) +val_ids = encode(val_data) +test_ids = encode(test_data) + +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") +print(f"test has {len(test_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +test_ids = np.array(test_ids, dtype=np.uint16) + +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) +test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin')) + +# save the meta information as well, to help us encode/decode later +meta = { + 'vocab_size': vocab_size, + 'itos': itos, + 'stoi': stoi, +} +with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f: + pickle.dump(meta, f) diff --git a/data/text8/prepare.py b/data/text8/prepare.py new file mode 100644 index 0000000000..f15a456685 --- /dev/null +++ b/data/text8/prepare.py @@ -0,0 +1,76 @@ +""" +Prepare the text8 dataset for character-level language modeling. +So instead of encoding with GPT-2 BPE tokens, we just map characters to ints. +Will save train.bin, val.bin containing the ids, and meta.pkl containing the +encoder and decoder and some other related info. 
+""" +import os +import pickle +import requests +import numpy as np + +# download the text8 dataset +input_file_path = os.path.join(os.path.dirname(__file__), 'text8') +if not os.path.exists(input_file_path): + data_url = 'http://mattmahoney.net/dc/text8.zip' + r = requests.get(data_url) + with open(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'wb') as f: + f.write(r.content) + + # unzip the text8 dataset + import zipfile + with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'r') as zip_ref: + zip_ref.extractall(os.path.dirname(__file__)) + +with open(input_file_path, 'r') as f: + data = f.read() +print(f"length of dataset in characters: {len(data):,}") + +# get all the unique characters that occur in this text +chars = sorted(list(set(data))) +vocab_size = len(chars) +print("all the unique characters:", ''.join(chars)) +print(f"vocab size: {vocab_size:,}") + +# create a mapping from characters to integers +stoi = { ch:i for i,ch in enumerate(chars) } +itos = { i:ch for i,ch in enumerate(chars) } +def encode(s): + return [stoi[c] for c in s] # encoder: take a string, output a list of integers +def decode(l): + return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string + +# create the train, validation, and test splits +n = len(data) +num_test_chars = 5000000 +train_data = data[: -2 * num_test_chars] +val_data = data[-2 * num_test_chars: -num_test_chars] +test_data = data[-num_test_chars:] + +# encode all splits to integers +train_ids = encode(train_data) +val_ids = encode(val_data) +test_ids = encode(test_data) + +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") +print(f"test has {len(test_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +test_ids = np.array(test_ids, dtype=np.uint16) + +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) +test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin')) + +# save the meta information as well, to help us encode/decode later +meta = { + 'vocab_size': vocab_size, + 'itos': itos, + 'stoi': stoi, +} +with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f: + pickle.dump(meta, f) +