From 316aa1c840e0c70c475d60b7c5371d7850b94336 Mon Sep 17 00:00:00 2001
From: Cheng Guo
Date: Tue, 25 Apr 2023 09:55:35 +0200
Subject: [PATCH] Add two popular datasets for character-level LM

---
 config/train_enwik8.py | 38 +++++++++++++++++++++
 config/train_text8.py  | 38 +++++++++++++++++++++
 data/enwik8/prepare.py | 75 +++++++++++++++++++++++++++++++++++++++++
 data/text8/prepare.py  | 76 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 227 insertions(+)
 create mode 100644 config/train_enwik8.py
 create mode 100644 config/train_text8.py
 create mode 100644 data/enwik8/prepare.py
 create mode 100644 data/text8/prepare.py

diff --git a/config/train_enwik8.py b/config/train_enwik8.py
new file mode 100644
index 0000000000..07f0120408
--- /dev/null
+++ b/config/train_enwik8.py
@@ -0,0 +1,38 @@
+# train a character-level model on enwik8
+
+out_dir = "out-enwik8"
+eval_interval = 1000
+eval_iters = 200
+log_interval = 100 # don't print too often
+
+# only save when val improves
+always_save_checkpoint = False
+
+# wandb_log = True # override via command line if you like
+# wandb_project = 'nanogpt'
+# wandb_run_name = 'enwik8'
+
+dataset = "enwik8"
+gradient_accumulation_steps = 1
+batch_size = 32
+block_size = 256 # context of up to 256 previous characters
+
+# baby GPT model :)
+n_layer = 6
+n_head = 6
+n_embd = 512
+dropout = 0.2
+
+learning_rate = 5e-4
+max_iters = 100000
+lr_decay_iters = max_iters # make equal to max_iters usually
+min_lr = 5e-5 # learning_rate / 10 usually
+beta2 = 0.99
+
+warmup_iters = 200 # not super necessary potentially
+
+# on macbook also add
+# device = 'cpu' # run on cpu only
+compile = True # use PyTorch 2.0 to compile the model to be faster
+# init_from = 'resume'
+# eval_only = True
diff --git a/config/train_text8.py b/config/train_text8.py
new file mode 100644
index 0000000000..82e0525cef
--- /dev/null
+++ b/config/train_text8.py
@@ -0,0 +1,38 @@
+# train a character-level model on text8
+
+out_dir = "out-text8"
+eval_interval = 1000
+eval_iters = 200
+log_interval = 100 # don't print too often
+
+# only save when val improves
+always_save_checkpoint = False
+
+# wandb_log = True # override via command line if you like
+# wandb_project = 'nanogpt'
+# wandb_run_name = 'text8'
+
+dataset = "text8"
+gradient_accumulation_steps = 1
+batch_size = 32
+block_size = 256 # context of up to 256 previous characters
+
+# baby GPT model :)
+n_layer = 6
+n_head = 6
+n_embd = 512
+dropout = 0.2
+
+learning_rate = 5e-4
+max_iters = 100000
+lr_decay_iters = max_iters # make equal to max_iters usually
+min_lr = 5e-5 # learning_rate / 10 usually
+beta2 = 0.99
+
+warmup_iters = 200 # not super necessary potentially
+
+# on macbook also add
+# device = 'cpu' # run on cpu only
+compile = True # use PyTorch 2.0 to compile the model to be faster
+# init_from = 'resume'
+# eval_only = True
diff --git a/data/enwik8/prepare.py b/data/enwik8/prepare.py
new file mode 100644
index 0000000000..e82f3d3416
--- /dev/null
+++ b/data/enwik8/prepare.py
@@ -0,0 +1,75 @@
+"""
+Prepare the enwik8 dataset for character-level language modeling.
+So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
+Will save train.bin, val.bin, test.bin containing the ids, and meta.pkl containing the
+encoder and decoder and some other related info.
+""" +import os +import pickle +import requests +import numpy as np + +# download the enwik8 dataset +input_file_path = os.path.join(os.path.dirname(__file__), 'enwik8') +if not os.path.exists(input_file_path): + data_url = 'http://mattmahoney.net/dc/enwik8.zip' + r = requests.get(data_url) + with open(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'wb') as f: + f.write(r.content) + + # unzip the enwik8 dataset + import zipfile + with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'r') as zip_ref: + zip_ref.extractall(os.path.dirname(__file__)) + +with open(input_file_path, 'r', encoding='latin-1') as f: + data = f.read() +print(f"length of dataset in characters: {len(data):,}") + +# get all the unique characters that occur in this text +chars = sorted(list(set(data))) +vocab_size = len(chars) +print("all the unique characters:", ''.join(chars)) +print(f"vocab size: {vocab_size:,}") + +# create a mapping from characters to integers +stoi = { ch:i for i,ch in enumerate(chars) } +itos = { i:ch for i,ch in enumerate(chars) } +def encode(s): + return [stoi[c] for c in s] # encoder: take a string, output a list of integers +def decode(l): + return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string + +# create the train, validation, and test splits +n = len(data) +num_test_chars = 5000000 +train_data = data[: -2 * num_test_chars] +val_data = data[-2 * num_test_chars: -num_test_chars] +test_data = data[-num_test_chars:] + +# encode all splits to integers +train_ids = encode(train_data) +val_ids = encode(val_data) +test_ids = encode(test_data) + +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") +print(f"test has {len(test_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +test_ids = np.array(test_ids, dtype=np.uint16) + +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) +test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin')) + +# save the meta information as well, to help us encode/decode later +meta = { + 'vocab_size': vocab_size, + 'itos': itos, + 'stoi': stoi, +} +with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f: + pickle.dump(meta, f) diff --git a/data/text8/prepare.py b/data/text8/prepare.py new file mode 100644 index 0000000000..f15a456685 --- /dev/null +++ b/data/text8/prepare.py @@ -0,0 +1,76 @@ +""" +Prepare the text8 dataset for character-level language modeling. +So instead of encoding with GPT-2 BPE tokens, we just map characters to ints. +Will save train.bin, val.bin containing the ids, and meta.pkl containing the +encoder and decoder and some other related info. 
+""" +import os +import pickle +import requests +import numpy as np + +# download the text8 dataset +input_file_path = os.path.join(os.path.dirname(__file__), 'text8') +if not os.path.exists(input_file_path): + data_url = 'http://mattmahoney.net/dc/text8.zip' + r = requests.get(data_url) + with open(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'wb') as f: + f.write(r.content) + + # unzip the text8 dataset + import zipfile + with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'r') as zip_ref: + zip_ref.extractall(os.path.dirname(__file__)) + +with open(input_file_path, 'r') as f: + data = f.read() +print(f"length of dataset in characters: {len(data):,}") + +# get all the unique characters that occur in this text +chars = sorted(list(set(data))) +vocab_size = len(chars) +print("all the unique characters:", ''.join(chars)) +print(f"vocab size: {vocab_size:,}") + +# create a mapping from characters to integers +stoi = { ch:i for i,ch in enumerate(chars) } +itos = { i:ch for i,ch in enumerate(chars) } +def encode(s): + return [stoi[c] for c in s] # encoder: take a string, output a list of integers +def decode(l): + return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string + +# create the train, validation, and test splits +n = len(data) +num_test_chars = 5000000 +train_data = data[: -2 * num_test_chars] +val_data = data[-2 * num_test_chars: -num_test_chars] +test_data = data[-num_test_chars:] + +# encode all splits to integers +train_ids = encode(train_data) +val_ids = encode(val_data) +test_ids = encode(test_data) + +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") +print(f"test has {len(test_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +test_ids = np.array(test_ids, dtype=np.uint16) + +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) +test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin')) + +# save the meta information as well, to help us encode/decode later +meta = { + 'vocab_size': vocab_size, + 'itos': itos, + 'stoi': stoi, +} +with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f: + pickle.dump(meta, f) +