Update create_tfrecords.py #17

Open · wants to merge 17 commits into master
102 changes: 79 additions & 23 deletions datasets/openwebtext/create_tfrecords.py
@@ -7,18 +7,49 @@
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from absl import app
from absl import flags

import encoder

base_dir = "/home/connor/2/newspaper" # Path to where your .txt files are located
files_per = 175000 # 175000 ~ 200-300MB
name = "openwebtext-newspaper" # Name of output files will be name_i.tfrecords where i is the number of the file
output_dir = "/home/connor/out"
log_dir = "logs"
files = glob.glob(os.path.join(base_dir, "**/*.txt"))
processes = 64 # Number of encoding processes to run
encoder_path = "gs://openwebtext/stuff/encoder" # Path to encoder files
minimum_size = 25
FLAGS = flags.FLAGS


flags.DEFINE_string(
    "base_dir",
    default="/home/connor/2/newspaper",
    help="Path to where your .txt files are located.")

flags.DEFINE_string(
    "output_dir",
    default="/home/connor/out",
    help="Destination directory for the tfrecords.")

flags.DEFINE_string(
    "log_dir",
    default="logs",
    help="Directory for log output.")

flags.DEFINE_string(
    "encoder_path",
    default="gs://openwebtext/stuff/encoder",
    help="Path to encoder files.")

flags.DEFINE_string(
    "name",
    default="openwebtext-newspaper",
    help="Name of output files; they will be named name_i.tfrecords where i is the number of the file.")

flags.DEFINE_integer(
    "processes",
    default=64,
    help="Number of encoding processes to run.")

flags.DEFINE_integer(
    "minimum_size",
    default=25,
    help="Minimum text size.")

flags.DEFINE_integer(
    "files_per",
    default=1500,
    help="File chunk size; number of .txt files per tfrecords file.")


def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
@@ -38,11 +38,6 @@ def chunks(l, n):
if not os.path.exists(log_dir):
    os.mkdir(log_dir)

enc = encoder.get_encoder(encoder_path)

file_chunks = chunks(files, files_per)

print("Got {} files, divided into {} chunks.".format(str(len(files)), str(len(file_chunks))))

def create_file(args):
    i, chunk = args
@@ -56,7 +82,7 @@ def create_file(args):
    good_files = 0
    current = None
    for fn in chunk:
        with tf.gfile.Open(fn, "r") as f:
        with tf.io.gfile.GFile(fn, "r") as f:
            d = f.read()
        d = ftfy.fix_text(d, normalization='NFKC')
        data = np.array(enc.encode(d), np.int32)
@@ -78,12 +104,42 @@

    return good_files
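The collapsed middle of create_file (the expanded region above) is where the encoded documents get serialized. A hedged sketch of that step, reusing the _int64_feature helper from this file; the feature key "text" and the encoded_docs name are illustrative assumptions, while name, output_dir, and i are the values already in scope in create_file:

# Sketch only: one TFRecordWriter per chunk, one tf.train.Example per document.
writer = tf.io.TFRecordWriter(
    os.path.join(output_dir, "{}_{}.tfrecords".format(name, i)))
for data in encoded_docs:  # hypothetical: np.int32 token arrays that passed minimum_size
    example = tf.train.Example(features=tf.train.Features(
        feature={"text": _int64_feature(data)}))
    writer.write(example.SerializeToString())
writer.close()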

start = time.time()
pool = Pool(processes=processes)
good = 0
for g in tqdm(pool.imap(create_file, enumerate(file_chunks)), total=len(file_chunks)):
    good += g

end = time.time()

print("Done! In {:.2f}s, {} / {} good files.".format(end-start, str(good), str(len(files))))
def main(argv):
    # create_file runs in the worker pool and reads enc, name, output_dir, and
    # minimum_size at module scope, so they are declared global here rather
    # than left as locals of main.
    global enc, files, name, output_dir, minimum_size

    base_dir = FLAGS.base_dir  # Path to where your .txt files are located
    files_per = FLAGS.files_per  # 175000 ~ 200-300MB
    name = FLAGS.name  # Name of output files will be name_i.tfrecords where i is the number of the file
    output_dir = FLAGS.output_dir
    log_dir = FLAGS.log_dir
    processes = FLAGS.processes  # Number of encoding processes to run
    encoder_path = FLAGS.encoder_path  # Path to encoder files
    minimum_size = FLAGS.minimum_size

    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    files = glob.glob(os.path.join(base_dir, "**/*.txt"), recursive=True)

    enc = encoder.get_encoder(encoder_path)

    file_chunks = chunks(files, files_per)

    print("Got {} files, divided into {} chunks.".format(str(len(files)), str(len(file_chunks))))

    start = time.time()
    pool = Pool(processes=processes)
    good = 0
    for g in tqdm(pool.imap(create_file, enumerate(file_chunks)), total=len(file_chunks)):
        good += g

    end = time.time()

    print("Done! In {:.2f}s, {} / {} good files.".format(end - start, str(good), str(len(files))))

if __name__ == '__main__':
    app.run(main)
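Assuming the flag definitions above, a run that overrides the defaults would look something like this (paths and values are placeholders, not taken from the PR):

python create_tfrecords.py --base_dir=/data/openwebtext/txt --output_dir=/data/openwebtext/tfrecords --files_per=175000 --processes=32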
6 changes: 3 additions & 3 deletions datasets/openwebtext/encoder.py
@@ -107,12 +107,12 @@ def decode(self, tokens):
        return text

def get_encoder(encoder_path):
    with tf.gfile.Open(os.path.join(encoder_path, 'encoder.json'), 'r') as f:
    with tf.io.gfile.GFile(os.path.join(encoder_path, 'encoder.json'), 'r') as f:
        encoder = json.load(f)
    with tf.gfile.Open(os.path.join(encoder_path, 'vocab.bpe'), 'r') as f:  # utf-8?
    with tf.io.gfile.GFile(os.path.join(encoder_path, 'vocab.bpe'), 'r') as f:  # utf-8?
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )
    )
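As a quick round-trip check of the updated loader (a sketch; the GCS path is just the flag default used elsewhere in this PR, and encode/decode are the Encoder methods shown above):

enc = get_encoder("gs://openwebtext/stuff/encoder")
tokens = enc.encode("Hello world")  # list of BPE token ids
print(enc.decode(tokens))           # "Hello world" again; the BPE round-trip is lossless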