-
Notifications
You must be signed in to change notification settings - Fork 12
/
random_data_generator.py
65 lines (55 loc) · 1.98 KB
/
random_data_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
"""random_data_generator: """
import math
import h5py
import numpy as np
__author__ = "Yandi Xia"
def write_hdf5_file(features, labels, hdf5_output):
    """
    Write feature and label arrays to a new HDF5 file.

    :param features: integer array written to dataset "text"
    :param labels: integer array written to dataset "label"
    :param hdf5_output: path of the HDF5 file to create (overwritten if it exists)
    :return: None
    """
    # Use a context manager so the file handle is closed even if a dataset
    # write raises; the previous bare f.close() leaked the handle on error.
    with h5py.File(hdf5_output, "w") as f:
        dset = f.create_dataset("text", shape=features.shape, dtype="int64")
        dset[...] = features
        dset = f.create_dataset("label", shape=labels.shape, dtype="int64")
        dset[...] = labels
def generate_batch_of_random_data_files(vocab_size,
                                        label_size,
                                        max_len,
                                        num_instances,
                                        num_split,
                                        hdf5_output_prefix):
    """
    Generate a batch of HDF5 data files filled with random features/labels.

    :param vocab_size:
            each word index is drawn from range(0, vocab_size)
    :param label_size:
            each label index is drawn from range(0, label_size)
    :param max_len:
            features have shape [num_instances, max_len]
    :param num_instances:
            total number of (feature row, label) pairs generated
    :param num_split:
            how many files the instances are split across
    :param hdf5_output_prefix:
            path prefix shared by the output files; file i is
            "<prefix><i>.hdf5"
    :return:
            a list of the file names written, in order
    """
    # Fixed seed so the generated data is reproducible across runs.
    rng = np.random.RandomState(1234)
    features = rng.randint(0, vocab_size, (num_instances, max_len), dtype="int64")
    labels = rng.randint(0, label_size, (num_instances,), dtype="int64")
    # Exact integer ceiling division: every instance is covered even when
    # num_instances is not a multiple of num_split (the last file is smaller).
    data_size = -(-num_instances // num_split)
    file_suffix = ".hdf5"
    file_list = []
    # BUGFIX: original used Python-2-only xrange(), a NameError on Python 3.
    for i in range(num_split):
        file_name = hdf5_output_prefix + str(i) + file_suffix
        write_hdf5_file(features=features[i * data_size: (i + 1) * data_size, :],
                        labels=labels[i * data_size: (i + 1) * data_size],
                        hdf5_output=file_name)
        file_list.append(file_name)
    return file_list