
[Examples] TPU-based training of a language model using TensorFlow #21657

Merged · 29 commits · Apr 14, 2023

Changes from 6 commits

Commits (29)
0684535
add: tokenizer training script for TF TPU LM training.
sayakpaul Feb 16, 2023
7b36763
add: script for preparing the TFRecord shards.
sayakpaul Feb 16, 2023
6a12cf2
add: sequence of execution to readme.
sayakpaul Feb 16, 2023
d6ddbb7
remove limit from the tfrecord shard name.
sayakpaul Feb 16, 2023
711ef60
Add initial train_model.py
Rocketknight1 Feb 21, 2023
24b9b25
Add basic training arguments and model init
Rocketknight1 Feb 22, 2023
f4656ef
Get up to the point of writing the data collator
Rocketknight1 Feb 22, 2023
126f021
Pushing progress so far!
Rocketknight1 Feb 22, 2023
14b4d9b
Complete first draft of model training code
Rocketknight1 Feb 27, 2023
af0aa28
feat: grouping of texts efficiently.
sayakpaul Feb 28, 2023
ad51abb
Add proper masking collator and get training loop working
Rocketknight1 Mar 8, 2023
95bef15
fix: things.
sayakpaul Mar 14, 2023
64c7d73
Read sample counts from filenames
Rocketknight1 Mar 20, 2023
e18f659
Read sample counts from filenames
Rocketknight1 Mar 20, 2023
c2ea2e1
Draft README
Rocketknight1 Mar 20, 2023
beeb897
Improve TPU warning
Rocketknight1 Mar 20, 2023
e2f9925
Use distribute instead of distribute.experimental
Rocketknight1 Mar 20, 2023
8456011
Apply suggestions from code review
sayakpaul Mar 21, 2023
6151870
Modularize loading and add MLM probability as arg
Rocketknight1 Mar 21, 2023
8d54835
Merge remote-tracking branch 'origin/examples/tf-tpu' into examples/t…
Rocketknight1 Mar 21, 2023
145981f
minor refactoring to better use the cli args.
sayakpaul Mar 25, 2023
ce3beec
Merge branch 'main' into examples/tf-tpu
sayakpaul Mar 25, 2023
b2e46de
readme fillup.
sayakpaul Mar 27, 2023
9ee6456
include tpu and inference sections in the readme.
sayakpaul Mar 27, 2023
46872bd
table of contents.
sayakpaul Mar 27, 2023
661cb92
parallelize maps.
sayakpaul Mar 27, 2023
21e5654
polish readme.
sayakpaul Apr 12, 2023
86a88ba
change script name to run_mlm.py
sayakpaul Apr 12, 2023
566a05d
address PR feedback (round I).
sayakpaul Apr 13, 2023
6 changes: 6 additions & 0 deletions examples/tensorflow/tpu/language-modeling/README.md
@@ -0,0 +1,6 @@
Detailed README TBA.

## Sequential execution of steps:

* `train_unigram.py`
* `prepare_tfrecord_shards.py`
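
Until the detailed README lands, here is a minimal sketch of that execution order. The `prepare_tfrecord_shards.py` flags shown are its defaults from this PR; `train_unigram.py` is not part of this diff, so it is assumed here to run with its own defaults.

```python
# Minimal sketch of the intended two-step pipeline (run from this directory).
import subprocess

# Step 1: train the Unigram tokenizer (script not shown in this diff; defaults assumed).
subprocess.run(["python", "train_unigram.py"], check=True)

# Step 2: tokenize the dataset and serialize it into TFRecord shards.
subprocess.run(
    [
        "python",
        "prepare_tfrecord_shards.py",
        "--tokenizer_name_or_path", "sayakpaul/unigram-tokenizer-wikitext",
        "--shard_size", "1000",
        "--split", "train",
        "--output_dir", "tf-tpu",
    ],
    check=True,
)
```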
142 changes: 142 additions & 0 deletions examples/tensorflow/tpu/language-modeling/prepare_tfrecord_shards.py
@@ -0,0 +1,142 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Script for preparing TFRecord shards for pre-tokenized examples."""

import argparse
import logging
import os

import datasets
import tensorflow as tf

from transformers import AutoTokenizer


logger = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Prepare TFRecord shards from pre-tokenized samples of the wikitext dataset."
    )
    parser.add_argument(
        "--tokenizer_name_or_path",
        type=str,
        default="sayakpaul/unigram-tokenizer-wikitext",
        help="Tokenizer identifier. Can be a local filepath or a Hub identifier.",
    )
    parser.add_argument(
        "--shard_size",
        type=int,
        default=1000,
        help="Number of entries to go in a single shard.",
    )
Review comment (PR author): We should likely follow some advice from this guide to decide this number when running things at the full scale.
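
As a rough, purely illustrative way to think about that choice (the numbers below are assumptions, not values from this PR): each record here holds two fixed-length int64 features, so its serialized size is fairly predictable, and a target shard size in the often-cited ~100 MB range for tf.data can be translated into a `--shard_size` value along these lines.

```python
# Back-of-the-envelope sizing for --shard_size (illustrative numbers only).
max_length = 128                       # matches the script's --max_length default
features_per_example = 2               # input_ids + attention_mask
bytes_per_value = 8                    # values are serialized as int64
per_record_overhead = 64               # rough protobuf/framing overhead (assumption)

approx_record_bytes = max_length * features_per_example * bytes_per_value + per_record_overhead
target_shard_bytes = 100 * 1024 * 1024  # aim for shards on the order of 100 MB

shard_size = target_shard_bytes // approx_record_bytes
print(f"~{approx_record_bytes} B per record -> --shard_size {shard_size}")
```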

    parser.add_argument("--split", type=str, default="train", choices=["train", "test", "validation"])
    parser.add_argument(
        "--limit",
        default=None,
        type=int,
        help="Limit the number of shards (used for debugging).",
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help="Maximum sequence length. For training on TPUs, it helps to have a maximum"
        " sequence length that is a multiple of 8.",
    )
    parser.add_argument(
        "--output_dir",
        default="tf-tpu",
        type=str,
        help="Output directory where the TFRecord shards will be saved. If the"
        " path is prefixed with `gs://` ('gs://tf-tpu', for example) then the TFRecord"
        " shards will be directly saved to a Google Cloud Storage bucket.",
    )

    args = parser.parse_args()
    return args


def get_serialized_examples(tokenizer):
    def fn(examples, max_length=128):
        tokenized_data = tokenizer(
            examples,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="np",
        )
        records = []
        for i in range(len(examples)):
            features = {
                "input_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=tokenized_data["input_ids"][i])),
                "attention_mask": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=tokenized_data["attention_mask"][i])
                ),
            }
            features = tf.train.Features(feature=features)
            example = tf.train.Example(features=features)
            record_bytes = example.SerializeToString()
            records.append(record_bytes)
        return records

    return fn


def main(args):
    wikitext = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split=args.split)

    if args.limit is not None:
        max_samples = min(len(wikitext), args.limit)
        wikitext = wikitext.select(range(max_samples))
        logger.info(f"Limiting the dataset to {args.limit} entries.")

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path)

    # Handle output directory creation.
    # For serializing into a Google Cloud Storage Bucket, one needs to first
    # create a bucket.
    if not args.output_dir.startswith("gs://"):
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        split_dir = os.path.join(args.output_dir, args.split)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
    else:
        split_dir = os.path.join(args.output_dir, args.split)

    shard_count = 0
    get_serialized_examples_fn = get_serialized_examples(tokenizer)
    for shard in range(0, len(wikitext), args.shard_size):
        dataset_snapshot = wikitext[shard : shard + args.shard_size]["text"]
        shard_size = len(dataset_snapshot)
        filename = os.path.join(split_dir, f"wikitext-{shard_count}-{shard_size}.tfrecord")
        serialized_examples = get_serialized_examples_fn(dataset_snapshot, max_length=args.max_length)

        with tf.io.TFRecordWriter(filename) as out_file:
            for i in range(shard_size):
                example = serialized_examples[i]
                out_file.write(example)
        logger.info("Wrote file {} containing {} records".format(filename, shard_size))

        shard_count += 1


if __name__ == "__main__":
    args = parse_args()
    main(args)
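
For a quick sanity check of the output, a written shard can be read back with a feature spec that mirrors the `tf.train.Example` layout serialized above. The shard path below is a placeholder; substitute a file actually produced by the script.

```python
# Sketch: parse one TFRecord shard written by prepare_tfrecord_shards.py.
import tensorflow as tf

max_length = 128  # must match the --max_length used when the shards were written
feature_spec = {
    "input_ids": tf.io.FixedLenFeature([max_length], tf.int64),
    "attention_mask": tf.io.FixedLenFeature([max_length], tf.int64),
}

def decode(serialized_example):
    # Mirrors the features written by get_serialized_examples().
    return tf.io.parse_single_example(serialized_example, feature_spec)

# Placeholder filename; use a real shard from the output directory.
dataset = tf.data.TFRecordDataset("tf-tpu/train/wikitext-0-1000.tfrecord").map(decode)
for example in dataset.take(1):
    print(example["input_ids"].shape, example["attention_mask"].shape)
```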
3 changes: 3 additions & 0 deletions examples/tensorflow/tpu/language-modeling/requirements.txt
@@ -0,0 +1,3 @@
transformers==4.26.1
datasets==2.9.0
tokenizers==0.13.2
130 changes: 130 additions & 0 deletions examples/tensorflow/tpu/language-modeling/train_model.py
@@ -0,0 +1,130 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Script for training a masked language model on TPU."""

import argparse
import logging

import tensorflow as tf

from transformers import AutoTokenizer, AutoConfig, TFAutoModelForMaskedLM


logger = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Train a masked language model on the TFRecord shards prepared from the wikitext dataset."
    )
    parser.add_argument(
        "--pretrained_model_config",
        type=str,
        default="roberta-base",
        help="The model config to use. Note that we don't copy the model's weights, only the config!",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        default="unigram-tokenizer-wikitext",
        help="The name of the tokenizer to load. We use the pretrained tokenizer to initialize the model's vocab size.",
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help="Maximum sequence length. For training on TPUs, it helps to have a maximum"
        " sequence length that is a multiple of 8.",
    )
    parser.add_argument(
        "--output_dir",
        default="tf-tpu",
        type=str,
        help="Output directory where the trained model will be saved. If the"
        " path is prefixed with `gs://` ('gs://tf-tpu', for example) then the model"
        " will be directly saved to a Google Cloud Storage bucket.",
    )
    parser.add_argument(
        "--tpu_name",
        type=str,
        help="Name of TPU resource to initialize. Should be blank on Colab, and 'local' on TPU VMs."
    )

    parser.add_argument(
        "--tpu_zone",
        type=str,
        help="Google cloud zone that TPU resource is located in. Only used for non-Colab TPU nodes."
    )

    parser.add_argument(
        "--gcp_project",
        type=str,
        help="Google cloud project name. Only used for non-Colab TPU nodes."
    )

    parser.add_argument(
        "--bfloat16",
        action="store_true",
        help="Use mixed-precision bfloat16 for training. This is the recommended lower-precision format for TPU."
    )

    args = parser.parse_args()
    return args


def initialize_tpu(args):
    try:
        if args.tpu_name:
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver(
                args.tpu_name, zone=args.tpu_zone, project=args.gcp_project
            )
        else:
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    except ValueError:
        raise RuntimeError("Couldn't connect to TPU!")

    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    return tpu


def main(args):
    tpu = initialize_tpu(args)
    strategy = tf.distribute.TPUStrategy(tpu)

    if args.bfloat16:
        tf.keras.mixed_precision.set_global_policy("mixed_bfloat16")

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    config = AutoConfig.from_pretrained(args.pretrained_model_config)
    config.vocab_size = tokenizer.vocab_size

    with strategy.scope():
        model = TFAutoModelForMaskedLM.from_config(config)
        model(model.dummy_inputs)  # Pass some dummy inputs through the model to ensure all the weights are built








if __name__ == "__main__":
    args = parse_args()
    main(args)
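
At this point in the PR (6 of 29 commits) the script stops after building the model; the input pipeline and training loop arrive in later commits, including a proper masking collator. Purely as a hedged sketch of where this is heading, and not the PR's actual implementation: the shards written by `prepare_tfrecord_shards.py` could be fed to `model.fit()` roughly as follows, reusing `tokenizer`, `model`, `strategy`, and `args` from `main()` above. The file pattern, batch size, and simplified masking scheme are assumptions.

```python
# Hedged sketch of the remaining steps: an input pipeline over the TFRecord shards,
# simplified MLM masking, and model.fit() under the TPU strategy.
feature_spec = {
    "input_ids": tf.io.FixedLenFeature([args.max_length], tf.int64),
    "attention_mask": tf.io.FixedLenFeature([args.max_length], tf.int64),
}

def decode(serialized_example):
    # Mirrors the layout written by prepare_tfrecord_shards.py.
    return tf.io.parse_single_example(serialized_example, feature_spec)

def mask_tokens(batch, mlm_probability=0.15):
    # Simplified masking: always swap selected tokens for the mask token; the PR's later
    # commits use a proper data collator with random/keep branches.
    # Assumes the tokenizer defines a mask token.
    input_ids = batch["input_ids"]
    maskable = tf.not_equal(batch["attention_mask"], 0)
    masked = tf.logical_and(tf.random.uniform(tf.shape(input_ids)) < mlm_probability, maskable)
    batch["labels"] = tf.where(masked, input_ids, tf.constant(-100, dtype=input_ids.dtype))
    mask_id = tf.constant(tokenizer.mask_token_id, dtype=input_ids.dtype)
    batch["input_ids"] = tf.where(masked, mask_id, input_ids)
    return batch

# File pattern and batch size are assumptions, not values from this PR.
per_replica_batch_size = 32
files = tf.io.gfile.glob("tf-tpu/train/*.tfrecord")
train_dataset = (
    tf.data.TFRecordDataset(files, num_parallel_reads=tf.data.AUTOTUNE)
    .map(decode, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(per_replica_batch_size * strategy.num_replicas_in_sync, drop_remainder=True)
    .map(mask_tokens, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

with strategy.scope():
    # transformers TF models compute the MLM loss internally from the "labels" key.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))
model.fit(train_dataset, epochs=1)
```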