-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
94 lines (79 loc) · 3.12 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
This Python script (re)generates topics
from Google Spreadsheets into a MongoDB JSON collection.
"""
import sys
import json
import itertools
import pygsheets
import regex
from slugify import slugify
class TopicsExtractor:
    """Build topic documents from Google Sheets worksheets and export them.

    The extractor reads a JSON "data reference" file from ``./data/``, opens
    the first worksheet of every spreadsheet listed in it, turns each
    non-blank row into a tag, and writes the collected topics to a JSON file.
    """

    def __init__(self, verbose=False, args=None):
        """Set up an extractor.

        Args:
            verbose: When true, print one line per spreadsheet extracted.
            args: Command-line style argument list (e.g. ``sys.argv``).
                Element 1, when present, is the data reference file name;
                otherwise the file name is left empty.
        """
        # ``None`` replaces a ``list()`` default so that no mutable default
        # object is shared between calls.
        args = [] if args is None else args
        self.GOOGLE_DRIVE_CREDENTIALS_FILE = "credentials.json"
        self.DATA_REFERENCE_FILE = args[1] if len(args) > 1 else ""
        self.verbose = verbose
        self.topics = []
        self.data_reference = []
        self.google_credentials = None

    def __regex_validation(self, tagname, regex_pattern):
        """Compile ``regex_pattern`` case-insensitively and report failures.

        A pattern that fails to compile is reported by printing the tag name
        and the error; nothing is raised, so one bad pattern does not abort
        the extraction.
        """
        try:
            regex.compile("(?i)" + regex_pattern)
        except Exception as e:  # report-only check: keep going on any failure
            print(tagname, e)

    def __validate(self, tag):
        """Check that the tag's regular expression(s) compile.

        For a tag flagged ``shuffle`` the pattern is split on ``.*?`` (when
        the pattern contains it) or on ``.*``, and every ordering of the
        pieces, re-joined with that same delimiter, is validated. Otherwise
        the pattern is validated as written.
        """
        if not tag["shuffle"]:
            self.__regex_validation(tag["tag"], tag["regex"])
            return

        delimiter = ".*?" if ".*?" in tag["regex"] else ".*"
        for perm in itertools.permutations(tag["regex"].split(delimiter)):
            self.__regex_validation(tag["tag"], delimiter.join(perm))

    def load_data_reference(self):
        """Load the data reference JSON file from ``./data/``."""
        # JSON is read as UTF-8 explicitly instead of relying on the
        # platform's default encoding.
        with open(
            "./data/" + self.DATA_REFERENCE_FILE, "r", encoding="utf-8"
        ) as data_reference_file:
            self.data_reference = json.load(data_reference_file)

    def get_knowledge_base(self):
        """Return the data reference file name up to its first ``.``."""
        return self.DATA_REFERENCE_FILE.split(".")[0]

    def load_google_credentials(self):
        """Authorize pygsheets with the service account credentials file."""
        self.google_credentials = pygsheets.authorize(
            service_account_file=self.GOOGLE_DRIVE_CREDENTIALS_FILE
        )

    @staticmethod
    def _parse_row(row):
        """Convert one worksheet row into a tag dict.

        The first four cells are read as shuffle flag, subtopic, tag name and
        regex. Rows shorter than four cells are padded with empty strings and
        extra cells are ignored.

        Returns:
            The tag dict, or ``None`` when all four cells are empty.

        Raises:
            ValueError: If the shuffle cell is neither blank nor an integer.
        """
        shuffle_flag, subtopic, tagname, pattern = (list(row) + [""] * 4)[:4]
        if shuffle_flag == "" and subtopic == "" and tagname == "" and pattern == "":
            return None
        return {
            "regex": pattern,
            "tag": tagname,
            "subtopic": subtopic,
            # A blank flag on an otherwise filled row means "not shuffled"
            # rather than crashing on int("").
            "shuffle": bool(int(shuffle_flag)) if shuffle_flag.strip() else False,
        }

    def load_topics(self):
        """Read every referenced spreadsheet and append a topic per entry.

        Each data reference item is copied without its ``filename`` key and
        extended with ``_id`` (slug of the lower-cased ``shortname``),
        ``tags``, ``knowledgebase`` and ``public``. The first worksheet row
        is skipped, and every remaining non-blank row becomes a validated tag.
        """
        knowledge_base = self.get_knowledge_base()
        for data_reference_item in self.data_reference:
            if self.verbose:
                print("[EXTRACT] {}".format(data_reference_item["name"]))

            wks = self.google_credentials.open(data_reference_item["filename"]).sheet1

            topic = data_reference_item.copy()
            del topic["filename"]
            topic["_id"] = slugify(topic["shortname"].lower())
            topic["tags"] = []
            # Set once per topic (not per row) so that a sheet without any
            # data rows still yields a complete topic document.
            topic["knowledgebase"] = knowledge_base
            topic["public"] = True

            data = wks.get_values(
                grange=pygsheets.GridRange(worksheet=wks, start=None, end=None)
            )
            for row in data[1:]:
                tag = self._parse_row(row)
                if tag is None:
                    continue
                self.__validate(tag)
                topic["tags"].append(tag)

            self.topics.append(topic)

    def export_topics(self, path="topics.json"):
        """Write the collected topics as JSON.

        Args:
            path: Destination file; defaults to ``topics.json`` in the
                current working directory.
        """
        # ensure_ascii=False emits raw non-ASCII characters, so the file is
        # written as UTF-8 explicitly. indent=1 is what the previous
        # ``indent=True`` evaluated to.
        with open(path, "w", encoding="utf-8") as file_out:
            file_out.write(json.dumps(self.topics, indent=1, ensure_ascii=False))
        print("[EXPORT] Created {} file".format(path))

    def run(self):
        """Run the full pipeline: load reference, authorize, extract, export."""
        self.load_data_reference()
        self.load_google_credentials()
        self.load_topics()
        self.export_topics()
if __name__ == "__main__":
    # Script entry point: run the whole extraction verbosely, taking the
    # data reference file name from the command-line arguments.
    TopicsExtractor(args=sys.argv, verbose=True).run()