Merge pull request huggingface#7 from ROCmSoftwarePlatform/gpt2-tf2
Updating GPT2-TF2 Scripts
stevenireeves authored Jun 29, 2021
2 parents ee5302e + 95080f2 commit bd12e8b
Showing 5 changed files with 155 additions and 4 deletions.
58 changes: 58 additions & 0 deletions scripts/gpt2-tf2/gpt2_1step.py
@@ -0,0 +1,58 @@
import sys

import numpy as np
import jsonlines as jsonl
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
import tensorflow as tf
from tensorflow.keras import metrics

BATCH_SIZE=1

def get_dataset(fil):
    data = []
    with jsonl.open(fil) as reader:
        for line in reader:
            data.append(line['text'])
    return data

if len(sys.argv) == 1:
    model_size = "Small"
    data_dir = '/dockerx/data/'
else:
    model_size = sys.argv[1]
    data_dir = sys.argv[2]

if model_size == "Small":
    model_name = "gpt2"
    train_file = data_dir+'small-117M.train.jsonl'
    test_file = data_dir+'small-117M.test.jsonl'
elif model_size == "Medium":
    model_name = "gpt2-medium"
    train_file = data_dir+'medium-345M.train.jsonl'
    test_file = data_dir+'medium-345M.test.jsonl'
elif model_size == "Large":
    model_name = "gpt2-large"
    train_file = data_dir+'large-762M.train.jsonl'
    test_file = data_dir+'large-762M.test.jsonl'
elif model_size == "XL":
    model_name = 'gpt2-xl'
    train_file = data_dir+'xl-1542M.train.jsonl'
    test_file = data_dir+'xl-1542M.test.jsonl'
print("Profiling model " + model_name)

tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
def tokenize(data):
    # Only the first example is tokenized; one sample is enough for a single profiled step
    data = tokenizer(data[0], return_tensors='tf', padding=True, truncation=True)
    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))

train_dataset = tokenize(get_dataset(train_file)).batch(BATCH_SIZE)
model = TFGPT2LMHeadModel.from_pretrained(model_name)
# Suppresses past_key_values so they are not reported in the progress bar
model.config.use_cache = False
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
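# The TF GPT-2 model can return extra outputs (one past_key_values entry per layer),
# so the loss list is padded with None to apply the loss to the logits only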
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer])
model.fit(train_dataset, epochs=1)  # the dataset is already batched, so no batch_size here
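
As a quick sanity check (a sketch, not part of the commit), the tokenize-and-slice pipeline above can be exercised on a made-up input sentence; it reuses the tokenizer already loaded above:

# Sketch: inspect the (features, labels) pair the pipeline produces
sample = tokenizer(["Hello world"], return_tensors='tf', padding=True, truncation=True)
check = tf.data.Dataset.from_tensor_slices((dict(sample), sample['input_ids'])).batch(1)
for features, labels in check:
    print(features['input_ids'].shape, labels.shape)  # both (1, sequence_length)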

5 changes: 5 additions & 0 deletions scripts/gpt2-tf2/gpt2_profile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import sys
import pandas as pd
profile_dir = sys.argv[1]
df = pd.read_csv(profile_dir+'results.stats.csv')
print('Total time for one step GPT2', sum(df["TotalDurationNs"])*1e-9, 's')
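The stats file that rocprof emits also breaks the total down per kernel. A small extension of the script above, assuming the CSV carries a 'Name' column alongside the 'TotalDurationNs' column the script already relies on, lists the most expensive kernels:

# Sketch: top kernels by accumulated duration (assumes a 'Name' column in the stats CSV)
top = df.sort_values('TotalDurationNs', ascending=False).head(10)
for name, ns in zip(top['Name'], top['TotalDurationNs']):
    print(name, ns * 1e-9, 's')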
7 changes: 3 additions & 4 deletions scripts/gpt2-tf2/gpt2_train.py
@@ -1,9 +1,9 @@
 import sys
 import numpy as np
+import jsonlines as jsonl
 from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
 import tensorflow as tf
 from tensorflow.keras import metrics
-import jsonlines as jsonl
 
 BATCH_SIZE=1
 
@@ -69,8 +69,7 @@ def tokenize(data, truncate=False):
 print("========================= Compiling Model ============================")
 model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
 print("========================= Finetuning Model ==================================")
-model.fit(train_dataset, batch_size=64, epochs=num_epochs)#, testation_data=test_dataset)
+model.fit(train_dataset, batch_size=64, epochs=num_epochs)
 print("========================= Evaluating Model ==================================")
 info = model.evaluate(test_dataset, verbose=2)
 #print("========================= Saving Model ======================================")
 #model.save(model_name+'finetuned')

82 changes: 82 additions & 0 deletions scripts/gpt2-tf2/gpt2_train_distributed.py
@@ -0,0 +1,82 @@
import sys
import numpy as np
import jsonlines as jsonl
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
import tensorflow as tf
from tensorflow.keras import metrics


def get_dataset(fil):
    data = []
    with jsonl.open(fil) as reader:
        for line in reader:
            data.append(line['text'])
    return data

if len(sys.argv) == 1:
    model_size = "Small"
    data_dir = '/dockerx/data/tf-gpt-2/data/'
    num_epochs = 1
    num_gpus = len(tf.config.list_physical_devices(device_type='GPU'))
    truncate = True
else:
    model_size = sys.argv[1]
    data_dir = sys.argv[2]
    num_epochs = int(sys.argv[3])
    num_gpus = int(sys.argv[4])
    if int(sys.argv[5]) == 1:
        truncate = True
    else:
        truncate = False

if model_size == "Small":
    model_name = "gpt2"
    train_file = data_dir+'small-117M-k40.train.jsonl'
    valid_file = data_dir+'small-117M-k40.valid.jsonl'
elif model_size == "Medium":
    model_name = "gpt2-medium"
    train_file = data_dir+'medium-345M-k40.train.jsonl'
    valid_file = data_dir+'medium-345M-k40.valid.jsonl'
elif model_size == "Large":
    model_name = "gpt2-large"
    train_file = data_dir+'large-762M-k40.train.jsonl'
    valid_file = data_dir+'large-762M-k40.valid.jsonl'
elif model_size == "XL":
    model_name = 'gpt2-xl'
    train_file = data_dir+'xl-1542M-k40.train.jsonl'
    valid_file = data_dir+'xl-1542M-k40.valid.jsonl'
print("Finetuning model " + model_name)
print("With dataset "+train_file)

def tokenize(data, tokenizer, truncate=False):
    if truncate:
        # Keep only the first 1000 examples to shorten the run
        data = tokenizer(data[:1000], return_tensors='tf', padding=True, truncation=True)
    else:
        data = tokenizer(data, return_tensors='tf', padding=True, truncation=True)
    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))

print("============================ Creating Distributed Strategy ===========================")
devices = []
for i in range(num_gpus):
    devices.append("GPU:"+str(i))
strategy = tf.distribute.MirroredStrategy(devices=devices)
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
print("============================ Loading model from pretrained and compiling ===========================")
with strategy.scope():
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    print("========================= Loading dataset ========================")
    # Global batch of num_gpus, i.e. one example per replica per step
    train_dataset = tokenize(get_dataset(train_file), tokenizer, truncate).batch(num_gpus)
    valid_dataset = tokenize(get_dataset(valid_file), tokenizer, truncate).batch(num_gpus)
    model = TFGPT2LMHeadModel.from_pretrained(model_name)
    # Disable past_key_values, as in gpt2_1step.py
    model.config.use_cache = False
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
    model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
print("========================= Finetuning Model ==================================")
model.fit(train_dataset, epochs=num_epochs)  # the dataset is already batched, so no batch_size here
print("========================= Evaluating Model ==================================")
model.evaluate(valid_dataset)
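
Each batch of num_gpus examples is split by MirroredStrategy so that every replica processes one example per step. A standalone sketch of that splitting behaviour (hypothetical sizes, not part of the commit):

# Sketch: MirroredStrategy divides each global batch across the replicas
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
global_batch = 4 * strategy.num_replicas_in_sync  # hypothetical global batch size
dataset = tf.data.Dataset.range(64).batch(global_batch)
for per_replica in strategy.experimental_distribute_dataset(dataset):
    print(per_replica)  # each replica holds global_batch / num_replicas_in_sync elements
    break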

7 changes: 7 additions & 0 deletions scripts/gpt2-tf2/profile_gpt2_train.sh
@@ -0,0 +1,7 @@
#!/bin/bash
model_size=$1
echo $model_size
model_dir=$2
profile_dir=$3
rocprof --stats python3 gpt2_1step.py $model_size $model_dir
python3 gpt2_profile.py $profile_dir
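
A hypothetical invocation, assuming rocprof writes results.stats.csv into the working directory, would be ./profile_gpt2_train.sh Small /dockerx/data/ ./ — the model size first, then the data directory passed to gpt2_1step.py, then the directory handed to gpt2_profile.py.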
