# checktfrecords.py (forked from google-research/albert)
import os

import tensorflow as tf
from tqdm import tqdm

import tokenization  # from the ALBERT repo (SentencePiece-backed FullTokenizer)

# These must match the values used when the TFRecords were written.
max_seq_length = 512
max_predictions_per_seq = 20

# TF 1.x: enable eager execution so tensors support .numpy() below.
tf.enable_eager_execution()
name_to_features = {
    "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    # Note: we keep the feature name `next_sentence_labels` to stay
    # compatible with the original data created by lanzhzh@. In the
    # ALBERT case it actually holds sentence-order labels.
    "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
    "masked_lm_positions": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
    "masked_lm_ids": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
    "masked_lm_weights": tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
}
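# Illustrative sketch (not part of the original script): a toy serialized
# tf.train.Example matching the spec above, handy for sanity-checking the
# parser without real data. All values here are placeholders.
def _toy_serialized_example():
    feature = {
        "input_ids": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[0] * max_seq_length)),
        "input_mask": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[1] * max_seq_length)),
        "segment_ids": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[0] * max_seq_length)),
        "next_sentence_labels": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[0])),
        "masked_lm_positions": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[0] * max_predictions_per_seq)),
        "masked_lm_ids": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[0] * max_predictions_per_seq)),
        "masked_lm_weights": tf.train.Feature(
            float_list=tf.train.FloatList(value=[0.0] * max_predictions_per_seq)),
    }
    return tf.train.Example(
        features=tf.train.Features(feature=feature)).SerializeToString()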
# FullTokenizer from the ALBERT repo; the vocab and SentencePiece model
# paths below are specific to this local setup.
tokenizer = tokenization.FullTokenizer(
    "../30k/tokenizer.vocab",
    do_lower_case=False,
    spm_model_file="../30k/tokenizer.model")
def _decode_record(record, name_to_features):
    """Decodes a record to a TensorFlow example."""
    example = tf.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
    # so cast all int64 features to int32.
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.to_int32(t)
        example[name] = t

    return example
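# Quick self-test (illustrative sketch, not part of the original script):
# parse the toy example defined above and confirm the int64 features were
# cast down to int32.
_toy_parsed = _decode_record(_toy_serialized_example(), name_to_features)
assert _toy_parsed["input_ids"].dtype == tf.int32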
# Colab-style parameters carried over from the pretraining notebook. The
# check below reads local files from ./test, so the GCS paths derived here
# are not used by this script.
BUCKET_NAME = "bertsweeu"  #@param {type:"string"}
MODEL_DIR = "albert_test_plz"  #@param {type:"string"}
PRETRAINING_DIR = "albert_data"  #@param {type:"string"}

# Training procedure config
NUM_TPU_CORES = 8

if BUCKET_NAME:
    BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
else:
    BUCKET_PATH = "."

BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)
CONFIG_GCS_DIR = "{}/{}/{}".format(BUCKET_PATH, "albert_base", "albert_config.json")
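# Illustrative sketch (not in the original script): to check records stored
# in the bucket instead of ./test, the same glob works on gs:// paths, since
# tf.gfile understands GCS:
#
#   gcs_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR, "*tfrecord"))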
# Collect the local TFRecord shards to inspect, e.g. cc458-2.txt.tfrecord,
# cc459-1.txt.tfrecord, cc464-1.txt.tfrecord.
input_files = tf.gfile.Glob(os.path.join("./test", "*tfrecord"))
print("Found {} TFRecord files".format(len(input_files)))
# Walk every record in every shard: decode it, detokenize the input ids,
# and dump the parsed features.
count = 0
for fn in tqdm(input_files):
    for record in tf.python_io.tf_record_iterator(fn):
        count += 1
        example = _decode_record(record, name_to_features)
        print(tokenizer.convert_ids_to_tokens(example["input_ids"].numpy().tolist()))
        print(example["segment_ids"].numpy().tolist())
        print(example)

print("total records:")
print(count)
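# Alternative (illustrative sketch, not part of the original script): the
# same decode via tf.data, which is roughly how the ALBERT pretraining
# input pipeline consumes these files. Shown for the first record only.
dataset = tf.data.TFRecordDataset(input_files)
dataset = dataset.map(lambda record: _decode_record(record, name_to_features))
for example in dataset.take(1):
    print(tokenizer.convert_ids_to_tokens(example["input_ids"].numpy().tolist()))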