-
Notifications
You must be signed in to change notification settings - Fork 6
/
analysis.py
75 lines (57 loc) · 1.86 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#encoding=utf-8
import sqlite3
import jieba
import jieba.posseg as pseg
# Open the SQLite database file "user.db" relative to the current working
# directory. The single cursor created here is module-level and is reused by
# analysis() and by the driver loop at the bottom of this file.
cx = sqlite3.connect("./user.db")
cu = cx.cursor()
# Optional jieba initialisation: uncomment to load a custom user dictionary.
#jieba.load_userdict("c:/mydict.txt")
#parse the messages of a single user
def analysis(session, userid):
    '''
    Tokenize all messages of one user in one session and store the counts.

    Selects every row of ``msgs`` whose ``session`` matches and whose ``nick``
    or ``userid`` equals *userid*, concatenates column index 5 of each row
    (presumably the message text -- confirm against the table schema), passes
    the combined text to tokens(), and inserts one row per distinct token into
    the ``tokens`` table through the module-level cursor ``cu``.

    Nothing is committed here; the caller must commit the connection.

    session -- value matched against msgs.session; stored as tokens.sessionid
    userid  -- value matched against msgs.nick or msgs.userid; stored as
               tokens.userid
    '''
    sql = "select * from msgs where (nick=? or userid=?) and session=?"
    rows = cu.execute(sql, (userid, userid, session)).fetchall()

    # Build the text with a single join rather than repeated "+=", and skip
    # NULL values, which would otherwise raise TypeError when concatenated.
    text = "".join(row[5] for row in rows if row[5] is not None)

    # tokens() maps each word to [count, flag]; flatten that into insert rows.
    records = [
        (session, userid, word, stats[0], stats[1])
        for word, stats in tokens(text).items()
    ]
    cu.executemany(
        "insert into tokens(sessionid, userid, token, count, flag) "
        "values (?, ?, ?, ?, ?) ;",
        records,
    )
def tokens(str):
    '''
    Segment *str* with jieba's posseg cutter and tally the resulting words.

    Returns a dict mapping each distinct word to a two-item list
    ``[count, flag]``: ``count`` is how many times the word was produced by
    the cutter and ``flag`` is the ``.flag`` attribute of its first
    occurrence (later occurrences only bump the count).
    '''
    tally = {}
    for piece in pseg.cut(str):
        entry = tally.get(piece.word)
        if entry is None:
            # First sighting: remember the flag alongside an initial count.
            tally[piece.word] = [1, piece.flag]
        else:
            entry[0] += 1
    return tally
# Analyse every distinct session found in the msgs table.
sql = 'select distinct session from msgs'
ress = cu.execute(sql).fetchall()
count = 0
print("session count: ", len(ress))
for (session,) in ress:
    # 1. Normal messages: participants are identified by nick.
    # 2. Group ("qun") messages, i.e. sessions whose name starts with the
    #    prefix below: participants are identified by userid.
    if not session.startswith(u"我的QQ群"):
        sql = "select distinct nick from msgs where session = ?"
    else:
        sql = "select distinct userid from msgs where session = ?"

    # Bind the session name as a parameter instead of formatting it into the
    # SQL text, so names containing quotes can neither break nor alter the
    # query. fetchall() is required before looping because analysis() runs
    # its own statements on the same cursor.
    for (participant,) in cu.execute(sql, (session,)).fetchall():
        analysis(session, participant)

    # Persist the tokens of this session before moving on to the next one.
    cx.commit()
    count += 1
    print('process session ', count, 'finished.')