forked from steciuk/AAL-computational-complexity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsynthetic.py
106 lines (77 loc) · 3.08 KB
/
synthetic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#
# Lukasz Pokorzyński, Adam Steciuk
# synthetic.py - words generator
#
import re
import argparse
from collections import defaultdict
import random
class Dictionaries:
    """Holds letter-transition statistics gathered from training words.

    Attributes:
        dictionary: dictionary[a][b] counts how often letter ``b`` follows
            letter ``a`` in the training words; the key ``""`` marks the
            end of a word.
        first_letters: first_letters[a] counts training words starting
            with letter ``a``.
    """

    def __init__(self):
        # These were originally *class* attributes, so every instance
        # shared the same defaultdicts and repeated training runs
        # accumulated counts across instances. Per-instance state fixes that.
        self.dictionary = defaultdict(lambda: defaultdict(int))
        self.first_letters = defaultdict(int)
def words(filename, num, seed=None):
    """Generate `num` synthetic words trained on the text in `filename`.

    Passing a `seed` makes the generated word list reproducible.
    """
    if seed is not None:
        random.seed(seed)
    corpus = read_file(filename)
    stats = create_dictionary(corpus)
    return generate_words(stats.first_letters, stats.dictionary, num)
def read_file(filename):
    """Read a UTF-8 text file and return its words, lowercased, in order.

    Words are maximal runs of word characters (``\\w``); everything else
    is treated as a separator.

    Args:
        filename: path to the training text file.

    Returns:
        List of lowercased word strings.
    """
    training_words = []
    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            # Raw string for the regex; re.split yields '' at line edges
            # (e.g. before the trailing newline), so skip empties.
            # The original code appended every piece and pop(-1)'d only the
            # final element: it left '' entries for all earlier lines
            # (which crash create_dictionary via word[0]) and could even
            # drop a real word if the file did not end with a separator.
            for word in re.split(r'\W+', line):
                if word:
                    training_words.append(word.lower())
    return training_words
def create_dictionary(training_words):
    """Build first-letter and letter-transition frequency tables.

    Args:
        training_words: iterable of word strings (empty strings are
            skipped; the original crashed with IndexError on them, and
            the companion read_file can produce them).

    Returns:
        A Dictionaries instance with `first_letters` and `dictionary`
        populated; the transition key ``""`` marks end-of-word.
    """
    stats = Dictionaries()
    for word in training_words:
        if not word:
            # Guard: an empty word has no first/last letter.
            continue
        stats.first_letters[word[0]] += 1
        # Count each adjacent letter pair within the word.
        for current, following in zip(word, word[1:]):
            stats.dictionary[current][following] += 1
        # End-of-word marker.
        stats.dictionary[word[-1]][""] += 1
    return stats
def generate_words(first_letters, dictionary, num):
    """Generate `num` words with a first-order letter Markov chain.

    Args:
        first_letters: mapping letter -> count of words starting with it.
        dictionary: mapping letter -> {next_letter: count}; the key ``""``
            marks end-of-word.
        num: how many words to generate.

    Returns:
        List of `num` generated word strings.
    """
    starts = list(first_letters.keys())
    start_weights = list(first_letters.values())
    generated = []
    for _ in range(num):
        # Pick the first letter by its frequency among word starts.
        word = random.choices(starts, weights=start_weights, k=1)[0]
        # Extend one letter at a time until the end-of-word marker "".
        while True:
            transitions = dictionary[word[-1]]
            next_letter = random.choices(list(transitions.keys()),
                                         weights=list(transitions.values()),
                                         k=1)[0]
            if next_letter == '':
                break
            word += next_letter
        generated.append(word)
    return generated
def list_to_string(s):
    """Concatenate an iterable of strings into a single string.

    Args:
        s: iterable of str.

    Returns:
        The concatenation of all elements; "" for an empty iterable.
    """
    # str.join is the idiomatic, linear-time concatenation; the original
    # manual `new += x` loop is worst-case quadratic.
    return "".join(s)
def setup_parser():
    """Build the command-line parser for the synthetic-word generator.

    Returns:
        argparse.ArgumentParser with:
          -i/--input  (required): training text file
          -n/--number (required, int): number of words to generate
          -o/--output (required): output file path
          -s/--seed   (optional): seed for reproducible generation
    """
    # NOTE: the original description was copy-pasted from a hashmap
    # benchmarking tool and did not describe this program at all.
    parser = argparse.ArgumentParser(
        description="Generate synthetic words with a first-order letter "
                    "Markov chain trained on an input text file")
    group_req = parser.add_argument_group("arguments")
    group_req.add_argument("-i", "--input", required=True,
                           help="input file with words")
    group_req.add_argument("-n", "--number", type=int, required=True,
                           help="number of words generated")
    group_req.add_argument("-o", "--output", required=True,
                           help="output file")
    group_req.add_argument("-s", "--seed", required=False, default=None,
                           help="optional seed for generation")
    return parser
if __name__ == "__main__":
    parser = setup_parser()
    args = parser.parse_args()
    words_list = words(args.input, args.number, args.seed)
    # Write with the same encoding the training file is read with
    # (read_file opens with utf8); relying on the locale default can raise
    # UnicodeEncodeError for non-ASCII words, e.g. on Windows.
    with open(args.output, 'w', encoding='utf8') as file:
        for word in words_list:
            # Words are separated by a single space in the output file.
            file.write(word + " ")
    print("Generated successfully to", args.output)