-
Notifications
You must be signed in to change notification settings - Fork 0
/
Lecture notes.tex
329 lines (272 loc) · 13.4 KB
/
Lecture notes.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
%\documentclass[a4paper]{article}
\documentclass[landscape,twocolumn,a4paper]{article}
\usepackage[a4paper,vmargin={20mm,20mm},hmargin={20mm,10mm}]{geometry}
\author{Simon Wehrli}
\date{11.5.2013}
\title{Set similarity with k-permutation Minwise Hashing - NOTES}
\usepackage[parfill]{parskip}
\usepackage{tikz} % for drawing advanced matrices
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{algorithmicx}
\usepackage{algpseudocode}
\usepackage{natbib}
\usepackage{tabulary}
\usepackage{caption}
\usepackage{framed}
\usepackage[english]{varioref} %automatische Anpassung von Referenzen
\labelformat{equation}{(#1)} %make references to equations like (1)
\usepackage{float} % bewirkt, dass Option [H] für float-Umgebungen von Latex umgesetzt wird
\usetikzlibrary{matrix,decorations.pathreplacing,backgrounds,positioning}
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\E}{E}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\newtheorem{mydef}{Definition}
\newtheorem{mylemma}{Lemma}
\newtheorem{mytheorem}{Theorem}
\newcommand*{\bfrac}[2]{\genfrac{\lbrace}{\rbrace}{0pt}{}{#1}{#2}}
\begin{document}
\maketitle
\section{Introduction}
Many today's applications are faced with very large datasets. A common task is to find \emph{similarity} between two or several such sets.
This notes are intended as a help for the talk and explains only the core concepts of \citep{LiK11} and \citep{LiOwZhang12}.
\subsection{Similarity}
We denote by $\Omega$ the set of all possible items of the sets $S_n \subseteq \Omega$, $n = 1 \text{ to } N$. $\left| \Omega \right| = D$ is always large (e.g. $D=2^{64}$). Often we consider only two sets $S_1 = X$, $S_2 = Y$. Let $a=|X \cap Y|,$ $b=|X|-a$ and $c=|Y|-a$. Figure \vref{fig:sets} summarizes these definitions graphically.
\begin{figure}[H]
\centering
\begin{tikzpicture}
% left hand
\scope
\clip (-2,-2) rectangle (2,2)
(1,0) circle (1);
\fill[fill=blue!30] (0,0) circle (1);
\endscope
% right hand
\scope
\clip (-2,-2) rectangle (2,2)
(0,0) circle (1);
\fill[fill=orange!30] (1,0) circle (1);
\endscope
% middle
\scope
\clip (-2,-2) rectangle (2,2)
(0,0) circle (1)
(1,0) circle (1);
\fill[fill=red!30] (1,0) circle (1);
\endscope
% outline
\draw (0,0) circle (1) (0,1) node [text=black,above] {$X$}
(1,0) circle (1) (1,1) node [text=black,above] {$Y$}
(-2,-2) rectangle (3,2) node [text=black,above] {$\Omega$};
\node[draw=none,text=blue,fill=white] at (-0.5,0) {$b$};
\node[draw=none,text=red,fill=white] at (0.5,0) {$a$};
\node[draw=none,text=orange,fill=white] at (1.5,0) {$c$};
\end{tikzpicture}
\caption{Two example sets in $\Omega$ and the notation $a,b,c$ for the sizes of important subsets.}
\label{fig:sets}
\end{figure}
\begin{framed}
\begin{mydef}\label{def:jaccard}
The normalized similarity between two sets $X$ and $Y$, known as \emph{resemblance} or \emph{Jaccard similarity}, denoted by R, is
\begin{equation}
R=\frac{\left| X \cap Y \right|}{\left| X \cup Y \right|} = \frac{a}{\left| X \right| + \left| Y \right| -a}=\frac{a}{a+b+c}
\end{equation}
\end{mydef}
\end{framed}
\paragraph{Other Notations}
Later we often look at some important event of a problem where we use $\Pr[\cdot]$ as a shorthand for the \emph{Probability} of that event. To get an estimator for $R$ out of a probabilistic argument, we always use some Bernoulli experiment. This is a process, where we repeatedly flip a possibly biased coin, but the bias does not change. We can see it as a sequence of binary random variables, because only two outcomes are possible, $0$ or $1$. An event is described by an equation. We use the notation $1\left\lbrace \text{\textit{equation}} \right\rbrace$ which is one if and only if the \textit{equation} in curly braces is true:
\begin{equation*}
1\left\lbrace \text{\textit{equation}} \right\rbrace =\begin{cases}
1 & \text{if \textit{equation} evaluates to \textit{true}} \\
0 & \text{otherwise}
\end{cases}
\end{equation*}
\pagebreak
\section{Original Minwise Hashing}
Suppose a random permutation $\pi$ is performed on $\Omega$\footnote{We assume there is a perfect hash function applied to the elements of the original domain which always gives us $\Omega = \{0,1,\ldots,D-1\}$. Note that in the paper \citep{LiK11}, the hash function is applied after the permutation and the minimum-function, but it is simpler to understand this way.},
\[
\pi:\Omega \longrightarrow \Omega, \qquad \text{where } \Omega = \{0,1,\ldots,D-1\}.
\]
To simplify notation, we overload the definition of $\pi$ to work also for subsets of $\Omega$. Thus with $\pi(S_n)$ we denote the application of the permutation $\pi$ to every element of the set $S_n$. More precisely,
\begin{equation*}
\begin{split}
\pi &: 2^\Omega \longrightarrow 2^\Omega, \\
\pi &: S_n \longmapsto \pi \left( S_n \right) = \left\lbrace \pi \left( i \right) \middle\vert i \in S_n \right\rbrace.
\end{split}
\end{equation*}
We define
\[
h_{S_n,\pi}=min(\pi(S_n))
\]
to be the smallest element of set $S_n$ permuted with $\pi_j$. Note that the smallest element changes under the permutations because $\pi$ is a permutation on $\Omega$ and $S_n$ is only a subset of $\Omega$ in general.
An elementary probability argument shows that for two sets $X,Y \subseteq \Omega$
\begin{equation}\label{eq:minwiseOri}
\Pr[h_{X,\pi}=h_{Y,\pi}]=\Pr [\min(\pi(X))=\min(\pi(Y))]=\frac{\left| X \cap Y \right|}{\left| X \cup Y \right|}=R.
\end{equation}
We can now build an unbiased estimator $\hat{R}_M$ of $R$ with $k$ minwise independent permutations, $\pi_1,\pi_2,\ldots,\pi_k$:
\begin{equation}
\hat{R}_M=\frac{1}{k}\sum_{j=1}^k 1 \left\lbrace h_{X,\pi_j}=h_{Y,\pi_j} \right\rbrace,
\end{equation}
\begin{equation}\label{eq:minWiseVariance}
Var(\hat{R}_M)=\frac{1}{k}R(1-R).
\end{equation}
\subsection{The Algorithm}
Based on the theoretical results, Algorithm \vref{alg:orgMinwiseHashing} presents the procedure of ($k$-permutation) \textsc{Minwise Hashing}.
\begin{algorithm}[H]
\caption{Original \textsc{Minwise Hashing} algorithm, applied to estimating pairwise resemblances in a collection of $N$ sets.}
\label{alg:orgMinwiseHashing}
\begin{algorithmic}
\Require Sets $S_n \subseteq \Omega = \{0,1,\ldots,D-1\}, n = 1 \text{ to } N$. \Comment $D = \left| \Omega \right|$
\Ensure Estimated resemblance $\hat{R}_M$
\State // Pre-processing
\State Generate $k$ random permutations $\pi_j: \Omega\longrightarrow\Omega, j=1\text{ to }k$
\ForAll{$n = 1 \text{ to } N, j = 1 \text{ to } k$}
\State Store $\min(\pi_j(S_n))$, denoted by $h_{S_n,\pi_j}$.
\EndFor
\State
\State // Estimation (Use two sets $X,Y$ as an example)
\State Estimate the resemblance by $\hat{R}_M=\frac{1}{k}\sum_{j=1}^k 1 \left\lbrace h_{X,\pi_j}=h_{Y,\pi_j} \right\rbrace$
\end{algorithmic}
\end{algorithm}
\section{One Permutation Hashing}
This algorithm is directly motivated by the optimization potential of the standard \textsc{Minwise Hashing} method: intuitively, it ought to be ``wasteful'' in that all elements in a set are permuted, scanned but only the minimum will be used. As the name already suggests, we reduce the preprocessing step to only one permutation.
We will setup a running example with $X,Y,Z \subseteq \Omega = \left\lbrace 0,1,\cdots,15\right\rbrace$. Let be $\pi$ some random permutation on $\Omega$ and the already permuted sets be
\[
\pi(X)=\left\lbrace 2,4,7,13\right\rbrace, \quad \pi(Y)=\left\lbrace 0,3,6,13\right\rbrace,\quad \pi(Z)= \left\lbrace 0,1,10,12\right\rbrace.
\]
Now again we build up a data matrix where the rows are equal to the vector representations of the permuted sets:
\begin{equation}
\begin{tikzpicture}[baseline = (M.center),% center with respect to the matrix center
every left delimiter/.style={xshift=1ex},%tighter delimiter spacing
every right delimiter/.style={xshift=-1ex}]
\matrix (M) [matrix of math nodes,left delimiter={(},right delimiter={)}
]{
0&0&1&0&1&0&0&1&0&0&0&0&0&1&0&0\\
1&0&0&1&0&0&1&0&0&0&0&0&0&1&0&0\\
1&1&0&0&0&0&0&0&0&0&1&0&1&0&0&0\\
};
\node[anchor=south east] (cornernode) at (M-1-1.north west) {Sets: \qquad \qquad}; %Position this more precisely if desired
\foreach[count=\xi] \x in {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}{ %\xi is the counter \x is the value
\node[font=\scriptsize] (M-0-\xi) at (cornernode -| M-1-\xi) {\x}; % Gets the top row
}
\foreach[count=\xi] \x in {$\pi(X)$:,$\pi(Y)$:,$\pi(Z)$:}{ %\xi is the counter \x is the value
\node (M-\xi-0) at (cornernode |- M-\xi-1) {\x}; %Gets the left most column
}
\draw[decoration=brace,decorate] (M-0-1.north west) -- (M-0-16.north east)%
node[midway,above] {$\Omega$};
\end{tikzpicture}
\end{equation}
The idea is to divide the columns evenly into $t$ (here $t=4$) bins (parts), take the minimum in each bin. Because later we only compare minima within one bin, we can re-index the elements to use the smallest possible representation:
\begin{equation} \label{matrix:complete}
\begin{tikzpicture}[baseline = (M.center),% center with respect to the matrix center
every left delimiter/.style={xshift=1ex},%tighter delimiter spacing
every right delimiter/.style={xshift=-1ex}]
\matrix (M) [matrix of math nodes,left delimiter={(},right delimiter={)}
]{
0&0&1&0&1&0&0&1&0&0&0&0&0&1&0&0\\
1&0&0&1&0&0&1&0&0&0&0&0&0&1&0&0\\
1&1&0&0&0&0&0&0&0&0&1&0&1&0&0&0\\
};
\node[anchor=south east] (cornernode) at (M-1-1.north west) {Sets: \qquad \qquad}; %Position this more precisely if desired
\foreach[count=\xi] \x in {0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3}{ %\xi is the counter \x is the value
\node[font=\scriptsize] (M-0-\xi) at (cornernode -| M-1-\xi) {\x}; % Gets the top row
}
\foreach[count=\xi] \x in {$\pi(X)$:,$\pi(Y)$:,$\pi(Z)$:}{ %\xi is the counter \x is the value
\node (M-\xi-0) at (cornernode |- M-\xi-1) {\x}; %Gets the left most column
}
\node[below left = 0.1pt of M-3-3.south] {bin $1$};
\node[below left = 0.1pt of M-3-7.south] {bin $2$};
\node[below left = 0.1pt of M-3-11.south] {bin $3$};
\node[below left = 0.1pt of M-3-15.south] {bin $4$};
\draw[decoration=brace,decorate] (M-0-1.north west) -- (M-0-4.north east)%
node[midway,above] {$\Omega'$};
\draw[decoration=brace,decorate] (M-0-5.north west) -- (M-0-8.north east)%
node[midway,above] {$\Omega'$};
\draw[decoration=brace,decorate] (M-0-9.north west) -- (M-0-12.north east)%
node[midway,above] {$\Omega'$};
\draw[decoration=brace,decorate] (M-0-13.north west) -- (M-0-16.north east)%
node[midway,above] {$\Omega'$};
\begin{scope}[on background layer]
\draw [fill=black!10] (M-1-1.north west) rectangle (M-3-4.south east);
\draw [fill=black!10] (M-1-5.north west) rectangle (M-3-8.south east);
\draw [fill=black!10] (M-1-9.north west) rectangle (M-3-12.south east);
\draw [fill=black!10] (M-1-13.north west) rectangle (M-3-16.south east);
\draw [draw=red] (M-1-3.center) circle (1.5ex);
\draw [draw=red] (M-2-1.center) circle (1.5ex);
\draw [draw=red] (M-3-1.center) circle (1.5ex);
\draw [draw=red] (M-1-5.center) circle (1.5ex);
\draw [draw=red] (M-2-7.center) circle (1.5ex);
\draw [draw=red] (M-3-11.center) circle (1.5ex);
\draw [draw=red] (M-1-14.center) circle (1.5ex);
\draw [draw=red] (M-2-14.center) circle (1.5ex);
\draw [draw=red] (M-3-13.center) circle (1.5ex);
\end{scope}
\end{tikzpicture}
\end{equation}
We get the minima-vectors
\begin{equation}
\begin{split}
v_X=[2,0,*,1],\\
v_Y=[0,2,*,1],\\
v_Z=[0,*,2,0],\\
\end{split}
\end{equation}
where '$*$' denotes an empty bin.
To derive the \emph{resemblance} between two sets, e.g. $X,Y$, we introduce two definitions:
\begin{equation} \label{eq:defEmpjMatj}
\begin{split}
\text{number of ``jointly empty bins'':} \qquad & N_{emp}=\sum_{j=1}^t I_{emp,j}, \\
\text{number of ``matched bins'':} \qquad & N_{mat}=\sum_{j=1}^t I_{mat,j},\\
\end{split}
\end{equation}
where $I_{emp,j}$ and $I_{mat,j}$ are defined for the $j$-th bin, as
\begin{equation} \label{eq:defEmpMat}
\begin{split}
I_{emp,j} &=\begin{cases}
1 & \text{if both $\pi(X)$ and $\pi(Y)$ are empty in the $j$-th bin} \\
0 & \text{otherwise}
\end{cases}\\
I_{mat,j} &=\begin{cases}
1 & \text{if both $\pi(X)$ and $\pi(Y)$ are not empty and the smallest} \\
& \text { elements in the $j$-th bin matches, i.e. $(v_X)_j=(v_Y)_j$} \\
0 & \text{otherwise}
\end{cases}
\end{split}
\end{equation}
\pagebreak
\subsection{The Estimator}
We formulate the estimator as
\begin{framed}
\begin{mylemma} \label{lem:onePermutationHashing}
The \emph{resemblance} is estimated by
\begin{equation}
\hat{R}_{mat}=\frac{N_{mat}}{k-N_{emp}}
\end{equation}
is unbiased, i.e.
\begin{equation}
\E\left[\hat{R}_{mat}\right]=R.
\end{equation}
\end{mylemma}
\end{framed}
In our example we have $N_{emp}=1$ and $N_{mat}=1$. Thus $\hat{R}_{mat}=1/3$.
Empirical results show that the \textsc{One Permutation Hashing} scheme performs as well or even slightly better than the \textsc{b-bit k-permutation hashing} scheme.
\begin{table}[h]
{
\begin{tabulary}{1\textwidth}{LLL}
\hline \textbf{Space used for \ldots}
& storing permutation(s) & storing pre-processing data per set \\
\hline Na\"{i}ve approach (store whole sets) & --- & $O(D)$ \\
\hline original \textsc{Minwise Hashing} & $O(kD\log(D))$ & $O(k\log(D))$ \\
\hline \textsc{b-bit Minwise Hashing} & $O(kD\log(D))$ & $O(kb)$ \\
\hline \textsc{One Permutation Hashing} & $O(D\log(D))$ & $O(t \log(D/t))$ \\
\hline
\end{tabulary}
}
\caption{Algorithm comparison. We only list the space complexity because the processing time boundaries are given by the time needed to read the stored data at least once and hence is dominated by the space complexity.}
\label{table:comparison}
\end{table}
\bibliographystyle{unsrt}
\bibliography{Bibliography}
\end{document}