From c8f191c541c65f70627ab3b93c5b961acd9cb0fa Mon Sep 17 00:00:00 2001
From: Steven I Reeves
Date: Wed, 9 Jun 2021 21:24:42 +0000
Subject: [PATCH 1/2] Moving gpt2 finetuning script into transformers/scripts.

---
 scripts/gpt2-tf2/gpt2_train.py | 71 ++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 scripts/gpt2-tf2/gpt2_train.py

diff --git a/scripts/gpt2-tf2/gpt2_train.py b/scripts/gpt2-tf2/gpt2_train.py
new file mode 100644
index 00000000000000..f72a1e3af7e163
--- /dev/null
+++ b/scripts/gpt2-tf2/gpt2_train.py
@@ -0,0 +1,71 @@
+import sys
+import numpy as np
+from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
+import tensorflow as tf
+import jsonlines as jsonl
+
+def get_dataset(fil):
+    data = []
+    with jsonl.open(fil) as reader:
+        for line in reader:
+            data.append(line['text'])
+    return data
+
+if len(sys.argv) == 1:
+    model_size = "Small"
+    data_dir = '/dockerx/data/'
+    num_epochs = 1
+    truncate = True
+else:
+    model_size = sys.argv[1]
+    data_dir = sys.argv[2]
+    num_epochs = int(sys.argv[3])
+    if int(sys.argv[4]) == 1:
+        truncate = True
+    else:
+        truncate = False
+
+if model_size == "Small":
+    model_name = "gpt2"
+    train_file = data_dir+'small-117M-k40.train.jsonl'
+    valid_file = data_dir+'small-117M-k40.valid.jsonl'
+elif model_size == "Medium":
+    model_name = "gpt2-medium"
+    train_file = data_dir+'medium-345M-k40.train.jsonl'
+    valid_file = data_dir+'medium-345M-k40.valid.jsonl'
+elif model_size == "Large":
+    model_name = "gpt2-large"
+    train_file = data_dir+'large-762M-k40.train.jsonl'
+    valid_file = data_dir+'large-762M-k40.valid.jsonl'
+elif model_size == "XL":
+    model_name = 'gpt2-xl'
+    train_file = data_dir+'xl-1542M-k40.train.jsonl'
+    valid_file = data_dir+'xl-1542M-k40.valid.jsonl'
+print("Finetuning model " + model_name)
+print("With dataset "+train_file)
+
+tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
+
+def tokenize(data, truncate=False):
+    if truncate:
+        data = tokenizer(data[:1000], return_tensors='tf', padding=True, truncation=True)
+    else:
+        data = tokenizer(data, return_tensors='tf', padding=True, truncation=True)
+    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))
+
+print("========================= Loading dataset ========================")
+train_dataset = tokenize(get_dataset(train_file), truncate)
+valid_dataset = tokenize(get_dataset(valid_file), truncate)
+print("============================ Loading model from pretrained ===========================")
+model = TFGPT2LMHeadModel.from_pretrained(model_name)
+optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
+loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+print("========================= Compiling Model ============================")
+model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer])
+print("========================= Finetuning Model ==================================")
+model.fit(train_dataset, batch_size=64, epochs=num_epochs)#, validation_data=valid_dataset)
+print("========================= Evaluating Model ==================================")
+model.evaluate(valid_dataset)
+print("========================= Saving Model ======================================")
+model.save(model_name+'finetuned')

From 65f52e307b3c7511cfa9375e4b915694190f5dfc Mon Sep 17 00:00:00 2001
From: Steven I Reeves
Date: Thu, 10 Jun 2021 20:23:25 +0000
Subject: [PATCH 2/2] Validation now has accuracy measurement

---
 scripts/gpt2-tf2/gpt2_train.py | 37 +++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/scripts/gpt2-tf2/gpt2_train.py b/scripts/gpt2-tf2/gpt2_train.py
index f72a1e3af7e163..cebcb06a0623cd 100644
--- a/scripts/gpt2-tf2/gpt2_train.py
+++ b/scripts/gpt2-tf2/gpt2_train.py
@@ -2,8 +2,11 @@
 import numpy as np
 from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
 import tensorflow as tf
+from tensorflow.keras import metrics
 import jsonlines as jsonl
 
+BATCH_SIZE=1
+
 def get_dataset(fil):
     data = []
     with jsonl.open(fil) as reader:
@@ -27,26 +30,25 @@ def get_dataset(fil):
 
 if model_size == "Small":
     model_name = "gpt2"
-    train_file = data_dir+'small-117M-k40.train.jsonl'
-    valid_file = data_dir+'small-117M-k40.valid.jsonl'
+    train_file = data_dir+'small-117M.train.jsonl'
+    test_file = data_dir+'small-117M.test.jsonl'
 elif model_size == "Medium":
     model_name = "gpt2-medium"
-    train_file = data_dir+'medium-345M-k40.train.jsonl'
-    valid_file = data_dir+'medium-345M-k40.valid.jsonl'
+    train_file = data_dir+'medium-345M.train.jsonl'
+    test_file = data_dir+'medium-345M.test.jsonl'
 elif model_size == "Large":
     model_name = "gpt2-large"
-    train_file = data_dir+'large-762M-k40.train.jsonl'
-    valid_file = data_dir+'large-762M-k40.valid.jsonl'
+    train_file = data_dir+'large-762M.train.jsonl'
+    test_file = data_dir+'large-762M.test.jsonl'
 elif model_size == "XL":
     model_name = 'gpt2-xl'
-    train_file = data_dir+'xl-1542M-k40.train.jsonl'
-    valid_file = data_dir+'xl-1542M-k40.valid.jsonl'
+    train_file = data_dir+'xl-1542M.train.jsonl'
+    test_file = data_dir+'xl-1542M.test.jsonl'
 print("Finetuning model " + model_name)
 print("With dataset "+train_file)
 
 tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
 tokenizer.pad_token = tokenizer.eos_token
-
 def tokenize(data, truncate=False):
     if truncate:
         data = tokenizer(data[:1000], return_tensors='tf', padding=True, truncation=True)
@@ -55,17 +57,20 @@ def tokenize(data, truncate=False):
     return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))
 
 print("========================= Loading dataset ========================")
-train_dataset = tokenize(get_dataset(train_file), truncate)
-valid_dataset = tokenize(get_dataset(valid_file), truncate)
+train_dataset = tokenize(get_dataset(train_file), truncate).shuffle(1000).batch(BATCH_SIZE)
+test_dataset = tokenize(get_dataset(test_file), truncate).batch(BATCH_SIZE)
 print("============================ Loading model from pretrained ===========================")
 model = TFGPT2LMHeadModel.from_pretrained(model_name)
+# Suppresses the past_key_values from being expressed in the progress bar
+model.config.use_cache=False
 optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
 print("========================= Compiling Model ============================")
-model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer])
+model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
 print("========================= Finetuning Model ==================================")
-model.fit(train_dataset, batch_size=64, epochs=num_epochs)#, validation_data=valid_dataset)
+model.fit(train_dataset, batch_size=64, epochs=num_epochs)#, validation_data=test_dataset)
 print("========================= Evaluating Model ==================================")
-model.evaluate(valid_dataset)
-print("========================= Saving Model ======================================")
-model.save(model_name+'finetuned')
+info = model.evaluate(test_dataset, verbose=2)
+#print("========================= Saving Model ======================================")
+#model.save(model_name+'finetuned')
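
Usage sketch, inferred from the sys.argv handling in the script above (the exact command line is an assumption, not something the patch itself documents): the script takes four positional arguments: the model size (Small, Medium, Large, or XL), the directory holding the *.train.jsonl / *.test.jsonl files, the number of epochs, and a 0/1 truncate flag (1 limits tokenization to the first 1000 examples); run with no arguments it falls back to Small, /dockerx/data/, one epoch, and truncation enabled. A hypothetical invocation:

    python scripts/gpt2-tf2/gpt2_train.py Medium /dockerx/data/ 3 1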