-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: ner_vocabulary_creator.py
122 lines (115 loc) · 6.21 KB
/
ner_vocabulary_creator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy
import nltk
# Raw NER training vocabulary: one dict per token, pairing the token with
# its entity tag.
#   "o"       -> filler / non-entity words (stop words, pronouns, verbs)
#   "keyword" -> content words worth remembering (objects, places, names)
# Duplicates ("table", "bill", "bike") and the original spellings
# ("catridge", "cobination") are training data and are kept verbatim;
# the vocabulary step later deduplicates words anyway.
_O_WORDS = [
    "remember", "that", "this", "something", "create", "note", "my", "a",
    "on", "is", "never", "mind", "cancel", "it", "I", "in", "at", "inside",
    "have", "kept", "put", "forget", "what", "where", "whom", "did", "give",
    "gave", "ask", "of", "to",
]
_KEYWORD_WORDS = [
    "book", "table", "trunk", "enrollment", "number", "aadhar", "pancard",
    "pan", "desk", "pocket", "table", "specific", "heat", "gravity", "wrist",
    "watch", "printer", "catridge", "ink", "mobile", "mouse", "bike", "lock",
    "combo", "cobination", "last", "month", "electricity", "bill", "mom",
    "rugved", "preet", "favorite", "color", "blue", "manual", "spent",
    "lunch", "wallet", "bill", "parked", "park", "car", "bike", "cycle",
    "G-12", "first", "floor", "second", "lot", "space", "passport", "bottom",
    "drawer", "filling", "cabinet", "spare", "key", "truck", "outside",
    "hidden", "alex", "called", "watered", "hermit", "crabs", "morning",
]
# Build the flat list in the same order as the original append sequence:
# all "o" entries first, then all "keyword" entries.
raw_data_set = [{'word': w, 'entity': "o"} for w in _O_WORDS]
raw_data_set += [{'word': w, 'entity': "keyword"} for w in _KEYWORD_WORDS]
# Flatten raw_data_set into three parallel views and persist them.
#   words     -> deduplicated, lowercased vocabulary (plus sentinel tokens)
#   entities  -> entity tag per raw entry (duplicates kept, 1:1 with input)
#   documents -> (word, entity) training pairs, original casing preserved
ignore_words = ["?", "'s"]  # punctuation/clitics excluded from the vocabulary
entities = [pattern['entity'] for pattern in raw_data_set]
documents = [(pattern['word'], pattern['entity']) for pattern in raw_data_set]
# Lowercase + dedupe the vocabulary. sorted() (instead of list(set(...)))
# makes the word -> index order reproducible across runs; a bare set's
# iteration order varies with PYTHONHASHSEED, which would silently change
# the saved vocabulary's indexing every run.
words = sorted({pattern['word'].lower()
                for pattern in raw_data_set
                if pattern['word'] not in ignore_words})
# Sentinel tokens: "$unknown" for out-of-vocabulary words, "$digit" for
# numeric tokens. Appended last so they sit at fixed trailing positions.
words.append("$unknown")
words.append("$digit")
# NOTE(review): documents keep original casing (e.g. "I") while the
# vocabulary is lowercased — confirm downstream lookups lowercase first.
numpy.savez_compressed('ner_vocabulary', words=words, entities=entities, documents=documents)
print("Vocabulary saved")