diff --git a/datasets/openwebtext/create_tfrecords.py b/datasets/openwebtext/create_tfrecords.py
index 675a0a5..a095d76 100644
--- a/datasets/openwebtext/create_tfrecords.py
+++ b/datasets/openwebtext/create_tfrecords.py
@@ -7,18 +7,49 @@ import numpy as np
 import tensorflow as tf
 from tqdm import tqdm
 
+from absl import app
+from absl import flags
+
 import encoder
 
-base_dir = "/home/connor/2/newspaper" # Path to where your .txt files are located
-files_per = 175000 # 175000 ~ 200-300MB
-name = "openwebtext-newspaper" # Name of output files will be name_i.tfrecords where i is the number of the file
-output_dir = "/home/connor/out"
-log_dir = "logs"
-files = glob.glob(os.path.join(base_dir, "**/*.txt"))
-processes = 64 # Number of encoding processes to run
-encoder_path = "gs://openwebtext/stuff/encoder" # Path to encoder files
-minimum_size = 25
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    "base_dir",
+    default="/home/connor/2/newspaper",
+    help="Path to where your .txt files are located.")
+
+flags.DEFINE_string(
+    "output_dir",
+    default="/home/connor/out",
+    help="Destination directory for the output tfrecords.")
+
+flags.DEFINE_string(
+    "encoder_path",
+    default="gs://openwebtext/stuff/encoder",
+    help="Path to encoder files.")
+
+flags.DEFINE_string(
+    "name",
+    default="openwebtext-newspaper",
+    help="Output files will be named name_i.tfrecords, where i is the index of the file.")
+
+flags.DEFINE_string(
+    "log_dir",
+    default="logs",
+    help="Directory for log files.")
+
+flags.DEFINE_integer(
+    "processes",
+    default=64,
+    help="Number of encoding processes to run.")
+
+flags.DEFINE_integer(
+    "minimum_size",
+    default=25,
+    help="Minimum text size to keep a file.")
+
+flags.DEFINE_integer(
+    "files_per",
+    default=1500,
+    help="Number of .txt files per output chunk.")
 
 
 def _int64_feature(value):
     """Returns an int64_list from a bool / enum / int / uint."""
@@ -38,11 +69,6 @@ def chunks(l, n):
 if not os.path.exists(log_dir):
     os.mkdir(log_dir)
 
-enc = encoder.get_encoder(encoder_path)
-
-file_chunks = chunks(files, files_per)
-
-print("Got {} files, divided into {} chunks.".format(str(len(files)), str(len(file_chunks))))
 
 def create_file(args):
     i, chunk = args
@@ -56,7 +82,7 @@ def create_file(args):
     good_files = 0
     current = None
     for fn in chunk:
-        with tf.gfile.Open(fn, "r") as f:
+        with tf.io.gfile.GFile(fn, "r") as f:
             d = f.read()
         d = ftfy.fix_text(d, normalization='NFKC')
         data = np.array(enc.encode(d), np.int32)
@@ -78,12 +104,42 @@ def create_file(args):
 
     return good_files
 
-start = time.time()
-pool = Pool(processes=processes)
-good = 0
-for g in tqdm(pool.imap(create_file, enumerate(file_chunks)), total=len(file_chunks)):
-    good += g
-
-end = time.time()
-print("Done! In {:.2f}s, {} / {} good files.".format(end-start, str(good), str(len(files))))
+def main(argv):
+    global enc
+    global files
+
+    base_dir = FLAGS.base_dir          # Path to where your .txt files are located
+    files_per = FLAGS.files_per        # 175000 ~ 200-300MB
+    name = FLAGS.name                  # Output files will be name_i.tfrecords, where i is the index of the file
+    output_dir = FLAGS.output_dir
+    log_dir = FLAGS.log_dir
+    processes = FLAGS.processes        # Number of encoding processes to run
+    encoder_path = FLAGS.encoder_path  # Path to encoder files
+    minimum_size = FLAGS.minimum_size
+
+    print(base_dir)
+    files = glob.glob(os.path.join(base_dir, "**/*.txt"), recursive=True)
+
+    enc = encoder.get_encoder(encoder_path)
+
+    file_chunks = chunks(files, files_per)
+
+    print("Got {} files, divided into {} chunks.".format(str(len(files)), str(len(file_chunks))))
+
+    start = time.time()
+    pool = Pool(processes=processes)
+    good = 0
+    for g in tqdm(pool.imap(create_file, enumerate(file_chunks)), total=len(file_chunks)):
+        good += g
+
+    end = time.time()
+
+    print("Done! In {:.2f}s, {} / {} good files.".format(end-start, str(good), str(len(files))))
+
+
+if __name__ == '__main__':
+    app.run(main)
diff --git a/datasets/openwebtext/encoder.py b/datasets/openwebtext/encoder.py
index 42f69e4..41b4217 100644
--- a/datasets/openwebtext/encoder.py
+++ b/datasets/openwebtext/encoder.py
@@ -107,12 +107,12 @@ def decode(self, tokens):
         return text
 
 def get_encoder(encoder_path):
-    with tf.gfile.Open(os.path.join(encoder_path, 'encoder.json'), 'r') as f:
+    with tf.io.gfile.GFile(os.path.join(encoder_path, 'encoder.json'), 'r') as f:
         encoder = json.load(f)
-    with tf.gfile.Open(os.path.join(encoder_path, 'vocab.bpe'), 'r') as f: # utf-8?
+    with tf.io.gfile.GFile(os.path.join(encoder_path, 'vocab.bpe'), 'r') as f: # utf-8?
         bpe_data = f.read()
     bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
     return Encoder(
         encoder=encoder,
         bpe_merges=bpe_merges,
-)
\ No newline at end of file
+)
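
With this patch the hard-coded module-level constants become absl flags, so the conversion script is configured from the command line instead of by editing the source. A minimal sketch of an invocation, using placeholder paths that are not part of the patch itself, might look like:

    python datasets/openwebtext/create_tfrecords.py \
        --base_dir=/path/to/txt/files \
        --output_dir=/path/to/tfrecords \
        --encoder_path=gs://openwebtext/stuff/encoder \
        --name=openwebtext-newspaper \
        --files_per=1500 --processes=64 --minimum_size=25

Flags that are not passed fall back to the defaults declared at the top of create_tfrecords.py.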