-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy path04rel.sql
143 lines (115 loc) · 2.89 KB
/
04rel.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
-- coo_rel
drop index if exists idx_view_coo_rel;
create index idx_view_coo_rel on view_log (uid, session_id, item_id);
drop table if exists coo_rel;
create table coo_rel as
select
*
from (
select
target_item_id,
cross_sell_item_id,
score,
row_number() over (partition by target_item_id order by score desc) rank
from (
select
a.item_id target_item_id,
b.item_id cross_sell_item_id,
count(*) score
from view_log a
inner join view_log b on
a.uid = b.uid and
a.session_id = b.session_id
group by a.item_id, b.item_id
) a
) a;
drop index if exists idx_coo_rel;
create index idx_coo_rel on coo_rel(target_item_id, rank);
-- td-idf
-- 1. idf
drop index if exists idx_tfidf_1;
drop index if exists idx_tfidf_2;
create index idx_tfidf_1 on view_log (session_id);
create index idx_tfidf_2 on view_log (item_id, session_id);
select count(distinct session_id)
from view_log; -- N
select item_id, count(session_id) df
from view_log
group by item_id; -- df
drop table if exists item_idf;
create table item_idf as
select item_id,
log((select count(distinct session_id) from view_log)/
(count(distinct session_id) + 1)) idf
from view_log a
group by item_id;
create index idx_idf on item_idf(item_id, idf);
-- 2. tf-idf by session(document)-item(word)
drop table if exists tfidf;
create table tfidf as
select
a.session_id,
a.item_id,
a.cnt*b.idf tfidf
from (
select session_id, item_id, count(*) cnt
from view_log
group by session_id, item_id
) a
inner join item_idf b on a.item_id = b.item_id;
drop index if exists idx_tfidf;
create index idx_tfidf on tfidf(session_id, item_id, tfidf);
-- 3. rel
drop table if exists tfidf_rel;
create table tfidf_rel as
select
*
from (
select
target_item_id,
cross_sell_item_id,
score,
row_number() over (partition by target_item_id order by score desc) rank
from (
select
a.item_id target_item_id,
b.item_id cross_sell_item_id,
sum(c.tfidf) score
from view_log a
inner join view_log b on a.uid = b.uid and a.session_id = b.session_id
inner join tfidf c on a.session_id = c.session_id and b.item_id = c.item_id
group by a.item_id, b.item_id
) a
) a;
-- coo - cosine
drop table if exists coo_cs;
create table coo_cs as
select
item_id,
session_id,
count(distinct session_id) / (sum(count(distinct session_id)) over (partition by item_id)) cs
from view_log
group by item_id, session_id;
drop index if exists idx_coo_cs;
create index idx_coo_cs on coo_cs(session_id, item_id, cs);
-- 3. rel
drop table if exists coo_cs_rel;
create table coo_cs_rel as
select
*
from (
select
target_item_id,
cross_sell_item_id,
score,
row_number() over (partition by target_item_id order by score desc) rank
from (
select
a.item_id target_item_id,
b.item_id cross_sell_item_id,
max(a.cs) * max(b.cs) score
from coo_cs a
inner join coo_cs b on a.session_id = b.session_id
group by a.item_id, b.item_id
) a
) a;