-
Notifications
You must be signed in to change notification settings - Fork 1
/
wordfreqcount.py
58 lines (48 loc) · 1.63 KB
/
wordfreqcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#! python3
# -*- coding: utf-8 -*-
import os, codecs
import jieba
from collections import Counter
# Base directory of the autohome data set. unitfile() reads its input files
# from the "auto_clean_sentence_txt/" subdirectory below it. This is a
# hard-coded absolute path; adjust it for the machine the script runs on.
ROOT_PATH = "/home/vagrant/PycharmProjects/Collection2/autohome/"
def get_words(txt, top_n=5000, tokenizer=None):
    """Segment *txt* and return its most frequent multi-character tokens.

    Tokens of length 1 and the token ``'\\r\\n'`` are ignored. The ranking is
    also printed to stdout, one line per word, under a fixed header line.

    Args:
        txt: The text to analyse.
        top_n: Maximum number of words to report, most frequent first.
            ``None`` reports every counted word. Defaults to 5000.
        tokenizer: Callable that takes the text and returns an iterable of
            string tokens. Defaults to ``jieba.cut``.

    Returns:
        A list of the reported words ordered from most to least frequent.
    """
    if tokenizer is None:
        tokenizer = jieba.cut

    counts = Counter(
        token
        for token in tokenizer(txt)
        if len(token) > 1 and token != '\r\n'
    )

    print('常用词频度统计结果')
    ranked = counts.most_common(top_n)
    for word, freq in ranked:
        # Left-pad words shorter than five characters so the column of
        # counts lines up in the console output.
        print('%s%s %s %d' % (' ' * (5 - len(word)), word, '*' * 5, freq))
    return [word for word, _ in ranked]
def unitfile(filename_list=None, source_dir=None, output_path=None):
    """Concatenate several text files into a single output file.

    The inputs are copied line by line, in the order given, into a freshly
    created output file. The total number of lines copied is printed once
    all inputs have been merged.

    Args:
        filename_list: Names of the files to merge, in output order.
            Defaults to the advice, drive and tech content files.
        source_dir: Directory that contains the input files. Defaults to
            ``ROOT_PATH + "auto_clean_sentence_txt/"``.
        output_path: Path of the merged file; it is created or overwritten.
            Defaults to ``"other_data_file/content_adv_dri_tech.txt"``.

    Files are read and written as UTF-8, the same encoding main() uses when
    it reads its input.
    """
    if filename_list is None:
        filename_list = ['content_advice.txt', 'content_drive.txt', 'content_tech.txt']
    if source_dir is None:
        source_dir = ROOT_PATH + "auto_clean_sentence_txt/"
    if output_path is None:
        output_path = "other_data_file/content_adv_dri_tech.txt"

    count = 0
    # Open the destination exactly once. Reopening it in "w" mode for every
    # input truncates the file each time, which loses the earlier inputs.
    with open(output_path, "w", encoding="utf-8") as merged:
        for name in filename_list:
            with open(os.path.join(source_dir, name), "r", encoding="utf-8") as src:
                lines = src.readlines()
            count += len(lines)
            merged.writelines(lines)
    print(count)
def main():
    """Write the high-frequency words of the input text to a word-list file.

    Reads ``split_data.txt`` as UTF-8, ranks its tokens with get_words()
    (which also prints the ranking) and writes the returned words, one per
    line, to ``zhangyan_data/freq_word_zhangyan.txt``.
    """
    # Alternative paths kept from the original code, for running against the
    # merged autohome corpus (optionally rebuilt with unitfile() first):
    #   input:  ROOT_PATH + "auto_clean_sentence_txt/content_adv_dri_tech.txt"
    #   output: ROOT_PATH + "auto_clean_sentence_txt/adv_dri_tech_high_freq_word_2.txt"
    new_file_path = "split_data.txt"
    high_freq_word = "zhangyan_data/freq_word_zhangyan.txt"

    with codecs.open(new_file_path, 'r', 'utf8') as f:
        txt = f.read()

    # Segment before opening the output file, so that a failure here does
    # not leave an existing word list truncated.
    words = get_words(txt)

    # The with blocks close both files; no explicit close() calls are needed.
    with codecs.open(high_freq_word, 'w', 'utf8') as f_w:
        f_w.write(''.join(word + '\n' for word in words))
# Run the word-frequency pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()