-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsentence.py
154 lines (134 loc) · 4.2 KB
/
sentence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import cipheycore
import tensorflow as tf
from tensorflow.keras.layers import (
Activation,
Conv1D,
Dense,
Dropout,
Flatten,
MaxPooling2D,
Reshape,
Input
)
import numpy
from tensorflow.keras.models import Sequential, load_model
import random
import sys
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.optimizer_v2.adam import Adam
# TF1-compat session created with GPU options whose
# per_process_gpu_memory_fraction is 0.8.
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8)
_session_config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
sess = tf.compat.v1.Session(config=_session_config)

# TensorBoard callback that logs under /tmp/tflog.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="/tmp/tflog")

# Number of character slots in every encoded input row; it is also the input
# size of the network built by make_model().
width = 512
def make_model():
    """Build and compile a fresh fully-connected binary classifier.

    The network takes a ``width``-long input vector, passes it through three
    stacks of three ReLU ``Dense`` layers (512, 512 and 256 units) and ends in
    a single sigmoid unit. It is compiled with the ``"adam"`` optimizer,
    binary cross-entropy loss and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    net = Sequential()
    net.add(Input((width,)))

    # NOTE(review): the source indentation was lost; Dropout(0.2) is assumed
    # to follow each Dense layer of the first stack -- confirm.
    for _ in range(3):
        net.add(Dense(512, activation="relu"))
        net.add(Dropout(0.2))

    # Two further stacks of three layers each, without dropout.
    for units in (512, 256):
        for _ in range(3):
            net.add(Dense(units, activation="relu"))

    net.add(Dense(1, activation="sigmoid"))
    net.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return net
# model = make_model()
# model.save("/tmp/model")
# exit(0)
def str_conv(*args):
    """Encode strings as fixed-width rows of Unicode code points.

    Every argument becomes one row of a ``(len(args), width)`` ``int32``
    array. Column ``j`` of a row holds ``ord`` of the string's character
    ``j``; unused trailing columns stay zero. Strings that do not fit are
    truncated to ``width`` columns instead of raising ``IndexError``.

    Args:
        *args: The strings to encode.

    Returns:
        numpy.ndarray: The zero-padded code-point matrix, one row per string.
    """
    ret = numpy.zeros((len(args), width), "int32")
    for row, text in zip(ret, args):
        # NOTE(review): the final character of each string is deliberately
        # left out (``[:-1]``). generate_ds() encodes its training samples the
        # same way, so changing it here alone would feed the trained model
        # inputs it has never seen -- fix both together or neither.
        for col, char in enumerate(text[:-1][:width]):
            # ``row`` is a view into ``ret``, so this writes the result array.
            row[col] = ord(char)
    return ret
# Load the previously saved model from disk.
model = tf.keras.models.load_model("/mnt/bigs/model")

# Prediction mode: when a command-line argument is given, print the model's
# output for that first argument and quit without loading any training data.
if len(sys.argv) > 1:
    print(model.predict(str_conv(*sys.argv[1:2]))[0][0])
    sys.exit(0)

# Nothing in this section fills ``wikis``; it stays an empty list.
wikis = []

#/usr/share/dict/words
with open("/mnt/bigs/hansard.txt", "r", encoding="cp1252") as f:
    hansard = f.read().splitlines()
with open("/mnt/bigs/twist.2.txt", "r", encoding="utf8") as f:
    twists = f.read().splitlines()
# with open("/mnt/bigs/wiki-links/all.txt", "r", encoding="utf8") as f:
#     twists = f.read().splitlines()
with open("/mnt/bigs/data/benchmark-v1.0/sentences.txt", "r", encoding="utf8") as f:
    benchmark_lines = f.read().splitlines()
# random.sample raises ValueError when asked for more items than exist, so the
# sample size is capped at the number of lines actually read.
sentences = random.sample(benchmark_lines, min(1000000, len(benchmark_lines)))

wordlists = [sentences, hansard, twists]
print(f"Loaded {len(wordlists)} datasets")

# ``lens`` maps entry length -> number of entries with that length, and
# ``data_size`` is the total number of entries; generate_ds() draws lengths
# from this histogram. Every entry is also fed to the cipheycore analysis.
lens = dict()
data_size = 0
analysis = cipheycore.start_analysis()
for wordlist in wordlists:
    for word in wordlist:
        cipheycore.continue_analysis(analysis, word)
        lens[len(word)] = lens.get(len(word), 0) + 1
        data_size += 1
cipheycore.finish_analysis(analysis)
print("Analysed frequencies")
# Inert string literal holding an inline model definition. It is a bare
# expression statement, so none of the code inside it is executed.
"""
model = Sequential()
model.add(Input((width,)))
for i in range(1, 4):
model.add(Dense(512, activation="relu"))
model.add(Dropout(0.2))
for i in range(1, 4):
model.add(Dense(256, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
model.compile(
optimizer="adam",
loss="binary_crossentropy",
metrics=["accuracy"],
)
"""
def generate_ds(number: int):
    """Generate a labelled dataset of real and fuzzed samples.

    Each of the ``number`` rows is, with probability ``fuzz_rate``, the output
    of ``cipheycore.fuzz`` for a length drawn from the ``lens`` histogram
    (label 0); otherwise it is a random entry of a randomly chosen word list
    (label 1). Samples are encoded as code points in a zero-padded row of
    ``width`` columns.

    Args:
        number: How many rows to generate.

    Returns:
        tuple: ``(ds_x, ds_y)`` where ``ds_x`` is a ``(number, width)``
        ``int32`` array of encoded samples and ``ds_y`` is a length-``number``
        float array of 0/1 labels.

    Raises:
        RuntimeError: If a fuzzed sample is requested but no word lengths
            were collected.
    """

    def rand_word_len():
        """Draw a length with probability proportional to its count in ``lens``."""
        if data_size <= 0:
            raise RuntimeError("no word lengths were collected")
        # 1..data_size gives every recorded entry exactly one slot; starting
        # at 0 would hand the first bucket an extra slot.
        pos = random.randint(1, data_size)
        for elem, count in lens.items():
            pos -= count
            if pos <= 0:
                return elem
        raise RuntimeError("word-length histogram does not add up to data_size")

    fuzz_rate = 0.5
    ds_x = numpy.zeros((number, width), 'int32')
    ds_y = numpy.zeros(number)
    # Report progress roughly every 1%; never let the step reach zero, which
    # would make the modulo below divide by zero for number < 100.
    progress_step = max(number // 100, 1)

    # Iterate over *all* rows, so the last row is not left empty and unlabelled.
    for i in range(number):
        if random.uniform(0, 1) < fuzz_rate:
            sample_word = cipheycore.fuzz(analysis, rand_word_len())
            ds_y[i] = 0
        else:
            sample_word = random.choice(random.choice(wordlists))
            ds_y[i] = 1

        # NOTE(review): a sample longer than ``width`` keeps its label but
        # leaves its row all zeros (unchanged from before) -- confirm whether
        # truncating or re-drawing would be preferable.
        if len(sample_word) <= width:
            # NOTE(review): the final character is left out (``[:-1]``), the
            # same encoding str_conv() uses at prediction time; change both
            # together or neither.
            for j, char in enumerate(sample_word[:-1]):
                ds_x[i][j] = ord(char)

        if i % progress_step == 0:
            print(f"generating dataset {(i / number) * 100}% complete")
    return ds_x, ds_y
# Training stops once the last epoch's validation accuracy exceeds this.
stop_threshold = 0.995

# ``learning_rate`` is the current name of the argument; ``lr`` is only a
# deprecated alias for it.
opt = Adam(learning_rate=0.00001)  # , decay=.02
model.compile(
    optimizer=opt,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

_SAMPLES_PER_ROUND = 1000000
_EPOCHS_PER_ROUND = 16
_BATCH_SIZE = 1024

# Each round trains on a freshly generated dataset and saves the model. No
# callbacks are passed to fit(); the stop decision is made between rounds by
# the val_accuracy check below.
while True:
    ds_x, ds_y = generate_ds(_SAMPLES_PER_ROUND)
    res = model.fit(
        ds_x,
        ds_y,
        callbacks=[],
        epochs=_EPOCHS_PER_ROUND,
        validation_split=0.2,
        batch_size=_BATCH_SIZE,
    )
    model.save("/mnt/bigs/model")
    if res.history['val_accuracy'][-1] > stop_threshold:
        break