-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_setup.py
executable file
·70 lines (61 loc) · 1.96 KB
/
data_setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
'''
Segment data using spaCy.
Create a word-frequency dict from the THUCNews articles.
'''
import spacy, torch
import os
import math
from cjklib import characterlookup
import re
# Load the spaCy pipeline once at import time; segment_thucnews() and
# segment_test() both call it to tokenize text.
nlp = spacy.load("zh_core_web_sm")
def segment_thucnews():
    """Segment every article of the cleaned THUCNews corpus with spaCy.

    Walks ``data/cleaned_THUCNews/<category>/<file>``, removes ideographic
    spaces (U+3000) from each article, runs the text through the module-level
    ``nlp`` pipeline and writes the tokens, each followed by a single space,
    to ``data/segmented_THUCNews/<category>/<file>``.

    Entries of the corpus root whose name starts with ``.`` are skipped.
    Articles that cannot be read are reported on stdout and skipped; no
    output file is created (or truncated) for them.
    """
    path = "data/cleaned_THUCNews"
    out_root = "data/segmented_THUCNews"
    # NOTE(review): files are opened with the platform default encoding, as
    # before -- confirm that this matches the encoding of the corpus.
    for category in os.listdir(path):
        if category.startswith('.'):
            continue
        dir_path = path + '/' + category
        out_dir = out_root + '/' + category
        # Make sure the destination exists so the write below cannot fail
        # merely because the category directory is missing.
        os.makedirs(out_dir, exist_ok=True)
        for name in os.listdir(dir_path):
            filename = dir_path + '/' + name
            print(filename)
            # Read first: the output file is only opened once we know there
            # is something to write, so a failed read leaves nothing behind.
            try:
                with open(filename, 'r') as src:
                    text = src.read().replace('\u3000', '')
            except (OSError, UnicodeDecodeError) as err:
                # Best effort: one unreadable article must not abort the run.
                print(f"skipping {filename}: {err}")
                continue
            doc = nlp(text)
            with open(out_dir + '/' + name, 'w') as dst:
                # Same output format as before: every token followed by ' '.
                dst.write(''.join(token.text + ' ' for token in doc))
def segment_test():
    """Segment every file in ``data/test/text`` with spaCy.

    Each file is read, ideographic spaces (U+3000) are removed, the text is
    run through the module-level ``nlp`` pipeline, and the tokens -- each
    followed by a single space -- are written to a file of the same name in
    ``data/test/segmented_text``.

    Raises:
        OSError: if an input file cannot be read or an output file cannot
            be written.
    """
    path = "data/test/text"
    out_dir = "data/test/segmented_text"
    # Make sure the destination exists before writing into it.
    os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): files are opened with the platform default encoding, as
    # before -- confirm that this matches the encoding of the test data.
    for name in os.listdir(path):
        # Read before opening the output so a failed read cannot leave a
        # truncated output file behind; both handles are closed promptly.
        with open(path + '/' + name, 'r') as src:
            text = src.read().replace('\u3000', '')
        doc = nlp(text)
        with open(out_dir + '/' + name, 'w') as dst:
            # Same output format as before: every token followed by ' '.
            dst.write(''.join(token.text + ' ' for token in doc))
'''
Create and save a dict that maps each word of the segmented THUCNews
articles to its frequency.
'''
def count_frequencies(path="data/segmented_THUCNews", out_path='data/frequency_dict'):
    """Count how often each segment occurs in a segmented corpus and save it.

    Reads every file in every sub-directory of ``path`` (skipping
    sub-directories whose name starts with ``.`` or contains
    ``annotations``), removes newlines, splits the text on single spaces and
    tallies each resulting segment. The tally is a plain ``dict`` mapping
    segment -> count and is written to ``out_path`` with ``torch.save``.

    Note that newlines are deleted rather than treated as separators, and
    that empty strings produced by trailing or consecutive spaces are
    counted under the key ``''``.

    Args:
        path: Root directory of the segmented corpus.
        out_path: File the frequency dict is saved to.
    """
    frequency_dict = {}
    # NOTE(review): files are opened with the platform default encoding, as
    # before -- confirm that this matches how the segmented files were written.
    for category in os.listdir(path):
        if category.startswith('.') or 'annotations' in category:
            continue
        dir_path = path + '/' + category
        for name in os.listdir(dir_path):
            # Close each handle as soon as it is read; the corpus may
            # contain a very large number of files.
            with open(dir_path + '/' + name, 'r') as src:
                text = src.read().replace('\n', '')
            for segment in text.split(' '):
                frequency_dict[segment] = frequency_dict.get(segment, 0) + 1
    torch.save(frequency_dict, out_path)
def main():
    """Script entry point: build and save the THUCNews frequency dict."""
    count_frequencies()


# Only run the frequency count when executed as a script, not on import.
if __name__ == "__main__":
    main()