From 979e190308500660f056a80e0589b981a1daa25b Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Mon, 3 Jul 2023 13:05:06 +0100 Subject: [PATCH 01/23] initial commit --- docs/img/blocking/pairwise_comparisons.png | Bin 0 -> 52182 bytes docs/topic_guides/blocking_model_training.md | 111 +++++++++++++ docs/topic_guides/blocking_predictions.md | 49 ++++++ docs/topic_guides/blocking_rules.md | 162 ++++--------------- docs/topic_guides/drivers_of_performance.md | 2 + docs/topic_guides/settings.md | 4 +- mkdocs.yml | 6 +- 7 files changed, 196 insertions(+), 138 deletions(-) create mode 100644 docs/img/blocking/pairwise_comparisons.png create mode 100644 docs/topic_guides/blocking_model_training.md create mode 100644 docs/topic_guides/blocking_predictions.md diff --git a/docs/img/blocking/pairwise_comparisons.png b/docs/img/blocking/pairwise_comparisons.png new file mode 100644 index 0000000000000000000000000000000000000000..c0489d7572e5f28bb2a6dab856d5b25f4ec82159 GIT binary patch literal 52182 zcma%D1yog8xK=tuE+rt{(jrJVC`bxOmox&>ozfwlB8?!Of;32{q;z+8Nxyv^b)0$g z)_QBP)|}@thLFz zn%L)D=d5T{v|m^6vfX#+R&QEhhRdu~`#_3P5a!RnG}7=87(VR7FEZh`kpKRBLoVnk zBSLx`^Y1@T2u7+P`hb2h{h!~b1fL`L`$OOh|M=WIh2?w`JeTuPwuaMkjCChfoKKNA zex+EJY|6d8ukox|8t!My_$HLAy-86w*QX8g8R8h72^qK51D3~WYU-&4JwD0>#Q+Y*SiAw=~^D0RkmNcz#sFSlNe z!>?C@YM#9WA9s>jXM>F`bSBDejP%Inw_HUwrgXPh-6ZrkTtC#*)j*;c*pSVSI=<|@)V z?be-YJgzgJ=H_z!UhIsQuT@u__ekqe6~>y@SUHF=Cz<_ePdC!b_5F4dW%T#rrX%?g zlNJqp(j0YViw40u-8En;&Q=mEzNx=`8gd_hH(4{a;d~>nYtwCOj77ED_El-f!q<_B za#PJy#yWUw=Zf9?*sRpix8R+Htx6wZB5`<%dxp-q@3-P^3|p{yUXSU8Dp^g`3-r>qu1zqekQt=M&zWwtCwQ`fe1`m(UD4MW?@DD__#KbJRKGBTo zO3M|zYp{rSU)(fU&I?y!|+h?+Hjc{jyz0XgS+npdR`F4w?2j^NwKhqot7 ziN_vleS7<@!a^IL%R1xSVvP4Z*s76uV>plIyG*&otd?7-KY7C}(Pj;DP0JK{87#7W zu$9^@QNYk@iig(tAPE1ACZ31rn$@;)WYRRU$x=J$O+72UNqO}~?{F>MPNyA)Pl@#2 zB)#FOr}>`zR1yi^JE`$A#%G`1LY(^}1?n6^4~U4@UNSp=fkn+Lia4$+N$YrI?-{yl zpCg|c`RQekMeV-dxkqr3L=2tPHuhoQ{>y8LIHm%#o$^sF1@zPkpxnG8Lw=rW)bUC$>)98Q+ckpzDqr%n0zQNZ 
zTyy3MR){L6@Vn`M%a+TPt1&t|o%K*IiT6dp37Q}?JtXuAh=C*oS$az{*RiRliqYRe z-&VJ%%j7qisjYI|Yeali+Ng)N@7WiQhkXvgcEy+po!_@8_yEVJ~F*GxH zq@?-vI>h0?L21jXDS3rbro#Bco2Ns|3=`TZ@Myw&vs-VX6D2%#L}Ge#q! zo2*&rH+cjOSi-r zqn2;)V!EV0VW6I;SUW(NGK&v$1eR*%mWJK1B3;q6%-$TDPADX;!v|5ON3krVE`p57 zu{n}nnW-Hf6+PH7JS=uxh;Q$E>RY-(0&XEb;e_U2^uug*5JDw=fa` zExzwhsaA7mBmxmok(b|9{-ZypFJrE&C2V?Jfx5T^n4y7aPJXdJ2 z&?#L%nLFuwMrOi|wlIv^f|G{%<416D%R*xHI?aBx14w-(R4w(F2a-7G7J(Mu-1_)V zvq>IHx(el!H^d;?&HLjXtCP{$BY9kHYQq=RN03JEz40QE&#ULMS$Pn~QLF!b|7nvt z98!^SvYI5^qCfVGB&8!28Nz@ACH+=o7phD)NdnW&<*Gm)Z*0Ksa9&(60f)>L8aa&B zJW`sVren63iXAJ$Q7bV?Se1-~1g0ZPeQ0m#<1%tI=t;}ArosAe}dz__LHu6RJeui7Klj| zxr)v1?SI#lm1BkTein2oZ>b>mi;eM2d~VPzJwy=UV2?H#d_oy3h zP|F@pVt>~Nf%n>x2WEh{h_K#8f?ul-LHs^nB3fAf>|35-$|Hjy6t3ci1F$KpQA~^d zk-ofrtGpa5*q(#>UFS)YpF|>(mk$KK_WoyHMaS@t2nzqpy=L^z1%k8(-y=Sqn>)27 zek9VN`99FnjMPLGzD4s+g`jcE0tF(l6}|8pv%EI*1narwg1?;vihCv59GO;lyRNQ> z_Zt$M05u<+xeQ2!Uhz3X-{t4mlaYMMkSmDT8m#Ve1gT&?y;2DrH@}Dq7N)RGG5eel z!e{*WVt!~kwJw3HcGGlT&y0J)8JWONuJ>j72n{_VHb9JotaSd)7l%7A1Qs}Z)NKp+ zZ&~#EKdppu!aI*7I)rk+#SFk+Ly{HZeb{n;A*?_;#2ucyRu2NdXQH0q&;UTF?0vBG?gCo(WZcGV?h%rHb3WceIQj(UFI z9XMW#Vs)5zHJ<6zJuW@P(qVQSgSFrYrUmnf&i?0o?&UVxz{*DQJwZ7VBXN7-G!d~? 
zbWWkyUl(+zTipyd+@37#Ih@%_>4e`}2=6m^;q{u`ddYihoG$oUBgV+Yf_b%9a0In7 z%+FH=>558~J&CKHVKYFOL*aB_<}K>p`ymDKG7}QN{HAl-`NQG}%_`jUob4SEkLmCov z#KR;u$wx%2brTLi-6BL5#;_nR=E6G+#is2q*e#+8WhD4=U&q%^MWcV$*r>bxAayc~ zE`+9kdLmriGqlS8@D@1{vV}o-z8iVT7Mnb&%R0dbs}Jn_YoZx>)mM#P@EhBa1Z~)B zs)vXPFnp%HJYq>tkVUc9_EehR8W3S2vc(GeS0D=*eya_|&NMMEX z^toqf)Z=_CN(et(gc-7_DJ=c>c~3-Qd!IDHBXV2VcJzfC7W;r#wM^|{HOu9;JbuP7 zM0fJhk{n+avxh8r-qOJkH|Jdg+i+UpRnqOSv(nybaIj$rgm4f$*4&!(t{dZsp771Q z%6pA$OYhnNCYlLjFSJ=He-Gl5J$L&~0t1;l=D8(O{M`P>J>122b4IQ-*C&&$-Mos0+Lc; zrrmqeab#did{hi61N0m}6K++d|3?ID#0rdZT7siNkXPk1<;5EFDGUUIy=Q&e(gsGM~c z)`p<UsyYRKgW0Zl2 zu=F|(o$fgLy_825PpV2M?1s$>c zIBt(U2wKvg?eK>@H0|Ag{>Gn-my=BA)595+DjmXce&S-sp7h1hheNIa_P_cV2}|br z9WD`$TY_9>s=A&qD|Qa?jsVkMx;imOfW&3Ldcrbk|2_r!Z9g9clyH(&M)fQWl^OYq zrwss3NvA`SlzQ0V5VUTqTWvuQI3rq_m@Gd&^r05@nodAz+_be#dV&OlA>e%S-53dS zxu>?jX>WyV8kLPx6FRyIQB@Ki3qGbmk-LQv*dSE-Wad4JEHkeO5*LiZWkY#Me=fQa z(`1;L4J(3V>SVLQbCL-`6(742%xgbKk4>GPXQ`8mquU82A5@d6zUBAref$!X(@x6+ z>5kz|`@{HqH033mBfZNrb#R?8QwVw*8WZD%KDC1eA3sVrizk z{JDHdD`d{C-kbWck=7F=NR=SA>^iiARw55i_(=0xojIGD(Pr+d zW({0!YTwAiV9aZt>=r$yK0^PjM>HPL;LX*gQx_1M&p|a}m2gt&$m^OeAigiG5#Anx z_T;4T`i<$$9aEGY!Y>h_C+d;YWu9N_zPqb$!q0H)M&(>xV12FMe;K%oJ;LKsy6a)p z%&B(4YshlFtNFA~$TNAsDEx}%HZ0tpGvSS3>gLguiq*(KXtB4r-y45i$O6R(i-UQ% zU@hq#crSj8Q^AnA_BP391aP=8D_!&a+waiQ_X=CCuN2dLjw zEqKV=VG&;5k@I&&auXLVg{!7If#9MM1lq%qw4w9(6FsRxLZJ7h8SY)Pxil3K zJNE0)g>l;2*}Y1=tr2(F3b*EJ-?PR9l>EnzBDnDjETJRyIb1J!)*KB8*B!Daa;3-=HcuyBj{#j=00aH_F@I*3hjzMBXq{^r*o zX)xS_{$@`4Ev!F?J;7Kiu$2t5l3{87Aoq|cZg0a7#R>7AuYJgA3ni!mG!u2Dpf14E zrH=c$c84T|`h{9`&i7?X4(3bIr@s43-m;X$GpG5)B;Uy7FvS+IS!U&eKM@j}cyRW5 zbjB~`n#v2cP^|s6o#|+yVw}(13Fl@TcZEiiaq*wS(YJ@0fIkPQl9^-LHmW*(TmnF( zPjeHR8k&5@{iJp{Oy)~X!vhKIuOkV}E6H#ivRo@;P)BVR-_dHohUVn~UXA{PSXk`w z<|u06T-)#%l)q~yc9GI#benZM&CKOd;uqMSp!XL)8EQz?uq@zgI~!Kkrv2=At7+SX zfW!R7P33&#I9kdg#qZhhEXM3->W^dAE)-n=oZ8UCFN$9qJ(iFk7RKsiSJdh&CO zCoJN}uFdEo(LhW(=ECczYQ---%S?tPET6FRu2)7nEVKk}e7e56=mumW?OGV0-OkiF zOG=8p4Im;luF75wxgIk=h_m0G$nhuC(9f 
z@SZX&)tFaJ0k(`TaTTzgoo=0R%qhY6tWT+BledB-4LMvx!+wR>8@MJ3!5Tpiz`w@K zLD{CQ$qHG&yF|I7+&(r{*2~{!3#(S{w;FwUb6cXnH;IQM9`|~db{0ojgG^b=DHA0| zNLU=XNo$VQVlX2RiXSJ+UVY%dnDa)@a5`L}+nG+DD+rCX0Fzcf(W-unp6~mdck%^p zo@RxHTA6WLlP~1PJQGTZQiSoQ6UXXWK}n^2)lbwo6kF)o+70e1uissf|Gl(#MCO`9 zE;3CzFXd}rnhU8|mt|nr(MM=PptWa1?(nCP(LpN#sHj?*<=gdd2PxdP;NI&6M@ zc@~u?HmUX3a3P0bXchM8&VY*(r}zDlQX~RgRYts$@xpC4DGxF`6}V@BaAgb07lU%=8BJ( zdf3h#k7~y$Yr84(Uu>~<#?Z$xXjDE=TJwsIj;2KCI|~1DXqE|hPRy|&3_(R}s~%L#y8EVAE@9{4zV-D)z*h#N5Be7w=wCvd|6 zkAe-MmqMbM7Obq97xpB+3RG(M(jSh;xkt?-5UQN@cweaFj;1xWQ5{^R%*L|m`RVCJ zN@(YoK>DSnc$>j2J4RZ?Io)%o{`n-WYTF#w>$6ppAzlBwMBgI(W%-Vy&pB#qA^U)U zO(wN>2IRbnH-(Q$cPyh~d(r_}C=nTzWGsW67&1AidGP|CkqEfgn&@4eY{LL#fZLKv zQHDUEgrPQ6?{@L-?6T}FGZuT^$=ZO3{nprgo`=k~ZZO`Y-Kb_3T%`g(?;RWlmFMRt z+j+Ii0lrW`?cDrE#g&FhMjO_-pLG-lgX{kFElYPhNUG*~s)Q{2c94Ch_Z#FIuLu5| z{^GK^R&__OZ1P%6lwAT)TlVWtXYxY%M5_VK2B$*^2zdoB3}GxH(ff~0qL0>x#6bG^ z$lLJt9%JN3=>+Z-5quvvK}&KnB_b``fFqC=XyrmbhwsgT6?#GQj+BWQ9SzALFs3iZ zqhz97cbWyErTt=EJKr9odZ^M$PthXZmBiz+gw1U1c?h4yOl=SqdrRAWX83awI4M`T z4<&{*+_9P{_a|^b7~P(Txtps|^yrf&Sw8$cV`k8D9B$3yXx=#O$a``)Cn*rQJYV-A*cd& z8{d{1o^-EfX=(()^$GM6w>w@Pf&Ae(Y59FmXC3g{8cb`b!9*JQ-3hu@X@XgT9EVXN zAiTo_4X-^{z4?e9oY!;Zn68{YD9I?Ovik&p zUX}ZX6&(8;9}A)q zJ5~RBbi;;*2&>g+k0q%bNSl#uS%(^xrobWUAW5gf&(zZRy^2w~*ZdiLw^;ALuam|( zDRwURStOUf3`BKWt=iwiUNdL(K1&B*=g`B8f2|WF7P1&3_JAcN%0`#7EzdV`5ksP9 zbtw@qbl-$*)zN3ljo)^{M76>#C5VS)=%Sa+_d)-(l|B<9INA-Ryiv))_7zq0_W9$(tE2|QEGezZXRi@_Ihi#`4N@|PVNq;Z4-;OgrwMYfmTLt3emM8J z>7&HMO+WcP-X+4sw1oSQP>ouWD9Yep^~2g_+Gj4v`OA9lw9&n5kGKV2jlbBzfEuN) zMp&dS-*^yV^U&Q~GV0SmuU{RGFZQaB(9&ooC_Cf_!%Pbp#bV1oPfEZ zs&o9`*Iw3stAxZybeSlfZ>R59n}#>9e7W};mO6($IT(mJKd3=T$F^Y zBW?bDeZCy5E^K9jI=D9D05;LtGzT3y_FA$fiwz~HIa%4~9XydHcs>(Er`8-SZG|fI zYb+7|``Vq8gx6W#IZQQup|^&uh^b&B+Gg|}BQAWkkv_vq8t9x7Uot=f*YGE$;ab1j zh1?Lz_&xSufXSP~BaRzfRrugx-FNmve z2Z(PHe!jfrTPRJ0FhuFDDiwaoi@hKURb=iJK%=n5$j6s9f|7GabkMf& z?eCo6x`$e$NtOYh#)xFKK2#k`In0350a+75_X+c*1-~UYhWk-eVa^dtn$6km_s+}| 
zME|5I6fMAS7)Z$JEFWp2!R$6Mb$<-$K|d@-&jil%)P(q(hs;vkNS3wS*$0Tp_D{ej zc>|6OlQy*?%S7@}MbW9VcQX#ua7aMvduTW<0UoS}Uz6GXmluG~<5tJdyoE2Rvi8?5 zv&T`Yd0%3d*P~G{-I^qlh|3gCLxuqAcu25n1twTq8l zhZDoGyXakCo{{8r;*XSjzF^ek)QOVyOnw-x`iUHxQi$7ppRIOqX(ibzY)~6nkNg-e z9lZ0{R+l@z+(ky$V47Tvj|?wVnl!BIf{xc|>>l;>hZRRVtm_Z%zgJH_)&FZ@F)iDg zfoKT>bfHck41E^%G&eM9{`_#A_;Gt)!b6aNn!s$HBUvVWYah|FVe1_+5cNG=jbI6z zUB7q)9gxjCsvkLS`|ligWII?0k~)%4Yayl+J_daCT}hp!AZRudLF#VOH!QnTM{2|e z_nmLB`EjYzj{!BkR1T~U4uH}0&n%WfvGrz$`1y9uS>hACMq=OYx?}u%)Z7iXhpeFY zjOXf!Ea>+(H=L+~M18soQ$|npBFSU!Ygz0d%IfGHE9>6-iB`}Rlv*+ng{B1>$sbQQ zaRw{+GjV4MW@!!-+Cs!-Yt&A!a+bOSX7KujyFd1e8>Vr!8k}s69{@GIi)Xj?!})hm zt#t=rP{EJ{r<`@a zD44QpCmPICX6^*$iF0sp8yt+NTXX_jL#Ey4h&2#-M9XQiT-_(jITo^@wZkY8{p`R8 zv1OnMMgeprktq?g6iAaCo>x>qfrXpLh+<3B4e^*8vLR#AtS*axR$kt;|6WMuR}2mB zUEb~0E)k+;zEZM#Fc62)N{)--S?_Xc-Wa9}D*Wi|r|;Qczpn-A_F#@e*3xx_@qjSk zm19kK_1Z&YguD?tZTk3VfZro_zS+MU)ILkOd4-V#7PWG8O8F0_YaHT$TJ>zY+Ror; zZJ^Zs$~i|dFZO6_T)||tkTHz!P>hk4$D&SN<2zWiqskGR}Uylh`-X{rs= zX6YHrZAv+Kx?f*9o4caVqkYUltyhu6Toz;daajNyM zPPw1C9d@?f^~u&$*{sSGlAx2ipRayn+9(C9wmngf6>!Gqaw6UtNwo~rn=Y4nlVQ5! 
z@C83qcy7M@YUS)Bl>ip%&v`C}H`yt_vzh8?BYJ``qKrlw6NSXjx)lG*O+$<=?i$ol zf@0*Ue43M$3?vlDRE_<1BnTvFx)<4`Vf^RWYyfEEBr0UdJOgO*h0~Id7pFhO@4Kjz zPYG$Gq(-@k;&FvPiMz_xP^KjAF@6&t0y;4R;$1>}D(Um?(( zBlc!9ga323(^}bZsrdyzt=7Ye;5DbXh)SD_fI-F7BJ~Oxr}7GmS*8^C^H+49TAvK1 zJuOA?x@b~d$rsx~qI^+tB|aH`jl4kjd|UhuIDcr-c`+Bh?Vr*YRtgu^9!Lm@BVh_l zKu%y}oua6vU$_C^j9iKvP^brMoos}rKJzuA5;4vKfa+F`x?3MZ?7An^2Oeby_>MHK zgx*1#-sE%OJb)Vs<2?v~f^e29;0aM1$ybe9A<(h!1t(fuHf*VkhHkP|y!HK;nKTo}XtK)r0Zm}32;%a%*sf5_=MA=C)6oFag8sNl{EhKsEvJIfafCq<$dG$JVdBn($5VLlDEeh7gbWH_4 zlf8JbknCI>`BTNC=9<#A|}` zRc5n4dUvBY;&2W*iQ7?x>*ansj0JP;oH&wZ)yagZLeiG?N)JB3qsnxB&hqk+yhr^a zF#G&Qy@~n*XuHf!#*j8IUMTBR4UdIPAU zEC5u#iftWeJaMH3hftFpH;uc3>jr{Pi8@a7L8dCnIPO^z55#( zlAh*%Odx=@U^HmF56dl3UaMv$+P>D}lg`vxB9uUP-dlf(76@FI$>Q&O?R>2~`?;(` zm1#6l#8_=R;}efD=eY5Mi-tb-cc?XY#KZ87O&J9af^y{}S1_tiZ-h-P_qsgi`!r~c z7NK0X@8TimBi%b3x zClo4mcr!=|Id|B5_uFX4CQg4siHu78^Llzf9U%HUNuSjWh>rv{18Zs7v>0`kdhm`M zKehGYdz*J4G*t?&5puWZMVU)X|Hco?3HSo`$5*oL`g1KfUf&N-p2cK}e4tCFX_u`@ zCJ2-#RjR>W3nWeK+)PEB?t%0Fdv$I9UfqNuHw9z)kj+fl1a&uQaXc=C)Qk6B`HBJH z>0JN#${zudz}%z;nUT*`ad#?F7wNP)qpGK-3{Ir&cat_4p%VqmUf&EIFBcdulkW%o zYA|AFWXmMm6`oh~)T+gXS(wXu&as7w2N_&Q5co;xnwpJarf<@(?8PD}W;p9-Aj zl5aNkZR@)^w(b$;1hDpf&`G2*v1hg716zff<3G0wc6|Vs*p?F@ zVD)1e!Tz58GsM|5Q~RzZ--PG~BqIfB(M+)d zX>&g|XPQDH7ySpYpNe!eNNqzC7l)t4H4XfxPlL?Pu2b(M9+ki(WlN_;e7g6v7kkaQ zK|Ma*9?bj@-tSEFd*(^=ilR=i)F)^ZWABFK8s8>{enagew~e9%m<#^@m<#=7J4~AL zJBgLTuTi9z0vQJk8K8l2>|XKc$u=}^u>9}I5LD8l!5Hz{yO%h2pj+XGJdJ&y%M=-|4|Xo9=Sf+{+oJhZ}+BO(pdYCs_H`+CB;_vO_n?&W}y~W0UaJ z7_LFNd8%;5k!KfyjOFw=aiBgLMrBVwd1%c}*AIN&jJx$0g|&yhJSbmV-{Knl)Y52Q zWQW^sj!5S!%U-l#)6as^hXT~3k-+M%v{SiwcO5wCmU`k9h8v(pF_JZ za{a}Qq@qK|%^xru7KJhQ1vxvNFLw{C=B;c#E(o%~l|=p>b; zj2*P3(06?urrYqnp9!6I^8UUQ0cIcu#+epYJq>&+nTks*FZwi;kQUTlFDm~O@0wQs z67P1Qxb1*hrC1QSr>THT_eJs%A(wRrP`-wZK#d&-YAN|xX6@wM9RLq_Rt|^E0V;^v zOaY*`=o)@`uuzldI2_17#I&lV8Q`0uPJPM+YMI3HQB z@-9oiXNv&s1kZr7`^|{Nb zhJgbj0e1~wi=Ow7B@Jd9JQ|?g6UcT*@@B!JTU?mz+37PtK+Jdqm5m+S@B2 
zB&*!ZwxPADUH9_G1ao-;^KIfI&^D20nGSASv!LtWf@jORiNJ^WZ{ZJrn{FhKG2W`S zWQ%;nu^iemFC|1I#vY|he>u&0n` zfbJM7i6aUh*!IWd)NYsG_vsQ5(jG$&kOlnlUzzMND4c@NaEL4D*qSXe}Smw({uR&noZzU+%8_JSZw!nkD-EC=oRh5RAgj#YJ zMkB!P`Pcgbd+vPl+<`;Mc7xJ635l%iMj)YE+%b+=;e66-Ar8k4N8v+F4fhI|;0Lp0 zNx1B{-W^;rsg;QX;w#t3pv?7wcQHDW3}s7M^V_=-?mJUlR9zU4ynJqwK~oHO<5DM0 zN_3`V99^HwkF59q7xTFJLJeb~1gjg|ci5t^f;vNRT?DsMG)^0ZYcBtT?Ut`rkq|Fd z#NDP#0>;En(L`=x=3+VQ{Z%C|An)RFE)sUqlJJW#oTrKSzo>Q5_sSwnuQeV>?<5f# zTX1XxlpXaAqZ%+tEXY$jUR#|wLYc^`AEI^bYr*X2pxZ6}DhFS2`S|&}Gt)+GQXu`2OFYuHfUjkPF%r4hpaUoiU$$C?v&OjkA$`Rl=x`2n`8~y~~ zisf-{1fv%Ttr8!2z4dy}_G-Zy<{gXP4-lyR_sabBWNa{%WY`?O@Ng7%}O~DBjy<@`3B-)o{6jFDCAoK<%yGMLO(Tt19-=YT!mB*0)?lt{~B3L z(k&S?C)XZKF3Y2gyhZGFp&M^O;a;KHa?`lli=7&Iz{|b@Ju6VhM6uJ+xKWCX{Z82qCi-%)1}Pp0&lX|__oGbCvMxBo!#AY z(0WJ*Dk_Dn3Xtj_ejRh_Nhh)C;52|nmEvzK1Wj4L#Nbz(cX*5%+G)85P%j*@_%=O7 zG-jMuiC&EsCh0Xi=*QwSn9mJ^!bD3Q@}HWUAum~sF-lpUC{~D;_B*c0Fa^Tt3mIBX z29K8ybcikH8lU8l77Ol9Regdg69{+kNZrnlBA^XFimXUhow1Awq4l47AwQ-a`5)7s zm;YLM^Yd<0+K>`Gl6$B^2I3Vuj3@kgw%msd;LPEb8R$7>u`PgZQ_SnRKVw{Vv~a7P z_OFdiwe#-vgkq3pRglmuXeE0ASa&D~Sm%5!VYft{o2!r&3aaIy;PO(OMd*GbUHwN0 z8A#VO1u`bx?AOoDK6_jk$N$<~`T-I&j`(fYo;#hdXA#kOvaBJO_w_421-nh+pSw-9 z4>}ShT19coMBQ{OXh^4kPDH794;!tBE%S`)UC%K2Wb>IRp+7AJ_m2%mw>drxbhmDx zzexZkGluijWyFcW8oFPngYipailHU~R@Q&bnSt_%>MGpa$JpX(S)3KPV{6i@FI}sCPO2qkN7P#$l zZ8yF>1Lh-2D7P6&CD{em1-=PwJuAgMGX2S0lO#5!WOwMi6#r{U+-taLm5~;DBQTwQ zO~@7K?*O-mMPl~`eYgTuw!ryg0O<8n-By%W{i&Y#rUvwF(5)!c>l^q>{PmQn>6)@H z2S^XU!8nUsroj$;9^Sm(I;K?c^RyZ%+>xRI{c1m;9=tjn#@)D zx64Q4sA|33S%`eTf($j*c5(xt&x9OQyfNT6a;4Kix{Q?NKNIEuJ_k#yT$pG!`8fm^ zhe{tA$%qO-=@lyat6Og#v`hZfgz0AS^l@rRQ|v+)v8{C9c1A05s!PG59zn&KY4U3K zt8=zPHisPzDDac&C7FCA1!dJN|5#zA3B{GNy!Lj6h+PkE>Q>a1SPZ`1LF*ANzBS~d z)DO8JE26N!B~M)5n{S5ZIH#ouu_e%!%C47bMl}r$7@@v@q&B+4ap+nbO(?~5;dH&n z&nUo|P(-<-OvW@|xCZp7NKny3t!4Yj`rd}n^_c(q&-Ku-H^5!x+|-b{VE`o$JF7U# zQFfvx^3hvd=3MD?P-Z@Fr{_Nz(>nmY%sD7GSGyj0m9`&yB%##Y)B$4HpKHr&Zy>Pg zsxpcoUv8_jgYk&9j)kiRa}AimqS`_TBB55tBJEmDcZ^~ZlD$Q;PSAKo|#*>f7Wpk%`R6G`1RmZ3RFKm 
zHE*~&$^PJZN|bzxE^zrA`4V)QsPELBYQ>xauVpL-#nTKr|Ac3NaV9$T;r=KUPY#|g zR@fKiS%vv@hV!vN@6sihWc|U6X9wFAvt2|EQ)JM5#V?0|e$S%rI5sH7V!FC~(NO7! zPK(|ld!t)!TEwGN53H<|54CcCc>!`O2%+v-Tq(Yvz&>xmyh`P@Tx!-0uA&ioC1mp- zYR9KGSUh%bU4;koaVx`_ib>X;uvwa!4VqP$Ya_m*z(c7Z#;`M8qa1huFp4;^AP!SK z1vo5_`#sb(2wHNh^gRPYGDljUf~H(7*HCqdf^X?hP-&I&RdTD_LW$_T5zyq16ZzfL zK$9gC)RUW6SS?{*yZx_^AhZ(0V%tcOgqKY8cS%jqEFAY-*7LWEi-F`IvLwxPlGY&mWN z?ago3Z;922gyXIKA%IZi;`q3h!Kx++_UwY!14zU&(2n(^?w$c|;+sei0TcGx6d{&{ z5bC)T9`G~j1t}&&NkOphHLZK^B$9RU+$_3@S%tD#*F9n#^6V25uw4sBku4cSWv)$a zQlCpIl>6X!pd?}}CW*_W@jQhL4qnRv=?d&c2_OJP&&_~}XD;y=1&6NVLl{3D+I=>0 z#jgMaP|7Cr(h(v!g_HJ@`H=C;5$iCB?_mNWL-A@GbWSVX*sc#gar3i8_gN(5v?Nu$ z+N%`nt^h6o?d@1v1?nny-Zzd=jghRd0Q+tC)#+?elaov$XGEfacD?H(!yY_n13x`@ zI_$OTI2xeSQD2H({pQ7`)s51*YU%VB*J<3hXLr1;zQxXZ%$3bVV;;*W|_6k*|Th`d)Zu4C7Hj@VyP@cf9v5rNg@2^_O+nX4n~T$ZNe zX@L*R582d;HX7*JKSW^QHl`|m>WFcI*|OkO)#&{~zm&5r$=LAs`!A4)JOPE+G3Lvo z4|sY-=zR$d__qcJV)vB$; z+<;_^;RTM_WB?xs{XkHq(;r4;rWpnm76(_~>D*D3{Yg~3R!aJ1 zMNXi=077u(GWKf<9a!4hUi7M>~S~7|NvEF2KHt~>Qwb*WMYiD5odEk{cDZTrSfX z{e9ink!Zv)Nyi#zp+Iaig?V2#(fj#R;-VD%yNixQd}|Fi*EN@5Q6ye0k~o?rVJ>te zVfygb?m^$wph_AIUg2Rt^$4RmRQdy?UwkY-P&&Bb^5AXtcR&(6$z|}Nf}_1f4nR8H z*$ceCEX#(3FeKzeI&8+MWE4s@P{Z;(Hz<2DCbEMp<5Ym{+O)Nr!WGd1;qr$UIL}k^ z!*k$QMj~tShF~sSH~vE~zGC1s@3yS{Z~rY(^rxXGqWcr*6Xvz&N&H4NwE7yaFJ7-d7D6$ z%*;xkEHo?_s&eDe^WJc}@Pfl3*ZGHoWrm!P-&LjBe*1nL>N$)aA}+7~xB?rK%;7Wi zYIRBU*f3Ou|1!=t9vF!By0>zP8?@{r#LxdciRc47+9HCqLHS;~cL#<-Nwr0aq9qd` zAhijvhwUnsky7bcp@}{}m=9rN2)lFj^t*%r{^zMQUb>guKyql=Ex#coEb_3|l8F3- zt9E@D`RT7~9<>s4WIF{|u9`G#-moG(TbECu5US#?AWWmFM9PL*_GyYpNqml$fA4S) zyoZ2;RqUTw#d1QCyoKv&+nMyunH^Ps$^JfQGqK%bgiE{%a$#-OrT+-6Fv-5cNBmLM zp`>VzTOb$6=Z4pgZHIZ2UsCoKk(kVM!`-zC99~N1bNM#D9FYL2bBLm9`XSi%SpGJ3 z%cl+%HW?_}TWJ9$ld9J$KYh04>{zl_z3K}LIOG=?UMKE`e84U6Q4qEi)y;n!RBIPg z)hrIJE%X1^UBbdt$&oCc`Ea&(a5k(9GyposQh#pRVRm~=9TbpQ@mf%5O4j$!(T71t zPc#G&vIgm~+&6dUt~02&IQ|Qe5+SWm<&esOUn1A091xP(=CE~FCzH!b4m0~R^syZe 
ze@s5k@5!fw=ia6xgx1UE7GUmedW#4o8V})sMQf4Ou;=+aLuI2ifI!xBUvAqCpqsK;?TAKaGW`D6@}RcM5j`;dpbrf((bHQ|hPqUTJ$Dyv=dDr+Av z1=RNied`gp=Lg<(srV?sxx_mydx`&1CubJ@c)lM-(Igcb+Eos>RontlbVHqo&^O5( z+qZ`j$)^eXE+qpRSNSdxH)WireSG{t@h|xO&-;PZZgol-DW}Xa%<*yXJ4~?5kfXmt zxZtJc@qHdKp2H#;xc)@=&(|n$-iB_jxEyah*{zCqlB!Swz%h08pEnEBjEAh^;e$b~ zjLziJqgN}w1Pw5yuP52L+%GGkx|Mm&HoGTc(vSCMaIiLLKz=bJP=e~v-d7JBSU7)` z#}nL*XTyH%imje}Yn+Hg2|`)Y39-4l0f)et2d$QRf_(}yP@!mItco=94-U?78)d;2 zs^Fsr{+15EAGScx7K z*l{`TrRD>doOzw%?oi@-r>+YvG}=@|{tuc+quf0&Si5ZK-RjiF?bXqqON5iuhJ!A8 zC|Rxw2uVsRDyq}_UT#nb({OVo8-z`(YshBS`j{m&zkW=4I)Mm{FC>i>a9pFUV%B@)KONj%*TV+?b z6u$S5pkDY#+^m11ui<3==_!=^u}6--7S*kQq(R0+@if?ScoO>RF{sZ|*(sj(7lrtb z*x|FsS?AiF$tZhKxYzz9f*K20g15iGOiOW$OQ`(Vj`;uBjxoeF&dbw%@NpPqiSR6| z4a9t@J4r16!`OSrQ~Ce@<0Y~R5t5zA2t~zFwjv~CuOdRqIQEE_QAS3{K1N1m7Akvh z$|@v8_Gnp!jNjvVv|g{z`+K|n{<~h|9M`$7>-oGMkNbKgidJ0T5)IGoiKlVe)PC}?(yJn zA7^751S?l$Zc;fa{?x@)Q=L0?H6E@5!{~1kUL?c%r{Xy<aBO5{A@}iU!(~Esfk5>ZNKN z=FleC06DR0g50pz)-4304$KQ(=GxX0{rT)qQArJn&M4FK;Lo=nky+&*$&0L6DdlHA z*PcgMN;}&9K$e(6%)uhw#onbix7eZRSgha}9#_=<{X2fCBen>iGVCW{&b&CM7okz8 z5#L&@vNkoBVR`@o1W+18p1A%-;f2jGi(@$i<(TYd5EMiTZ}Q*LqWz(G{{9EY$T?iv zwY2r6>_Rk1D2&jYk8p_%7Tz=fKh*xMHQ5Bdi&A zm~_yY3qh_M?LcV9#bgP7(4@v(NI2C3;KIEu{zo3#hK^B#E@L6;`6g#)-dB{dX;p%_ zUdV09LbuAZ;tj&b$t{o6(w#=GRvQ>^}p3 z%;OdJb0ZULF@OC(t`@PEe%FUs53k({w1!>rc))p|Gj>lW& zsWlMqIgmogKl_aOe|w$%A9)TI+jT#@z_{E`%<7S1Eq&IbNW4b>mjd0&+@Aas}J- zRzrT~FADCYYFg&?^IOt8^u+YAoSjuaKA+A%Q%Cg{;nW~+IHJ8ru=rBI-HI+*e~L`o zXkWi_j$!{*%9g>B=aBCfzDgbr9E

Q%lKy={J0%}QhHz+uK>DR2H1LoD8X6^^{mNO^vp z5rpSm55<-1uM6u*;LeP_snYrMt|{zP#Wygse&Z0hYFG1G2O!+~cjZ>MN!eeL)DG>X zCv&({JZ+S;vMVLwE9>%4wR4cl$?#F!@MY%cxnnl$9#q%~>87p#QBuN3J|h~`d^KYs zwxngvp-f_@jBD*fSIn8b0`>;KJrSfz&UA}JP%zZ4UM(abvGJOQ%X4;({F?nHuX%<6 zY0z|e1*PO0Jk($p@^7fYJLO!!ZkA(Ik(E$U(Ejm0pZ))*jhNtZ1eKEK~A@d>y;#8A zgc%h*Nl4m6OaVZ4Ud1=PRgl7Ez-BzjUBJ1`2UGsLbhFnQzo7_~TS8$TR8APOB`8}B zZ4Fq(LN4jE)n{uz$+P&7PR*niKx07Fwy`>uoE;MTt5@Fp->(|Gq5k(*LB+u{9|C=z zWQG_Kkyl*{AJtcdpOix^M@w`tVtjp1|GvE2LVVpo`*15oiy^9mA)I~Zh3P=$8%i)F z*1`BFd}Plji%f_tojqfRI~kfr6en~96@-fi|6LGH<^2~$d#P&rld@&`&)L3g?{|3VdiY{qO+1D9)jof(2I(3@tq?qMy=L47~abo8DY2;=bIpg47+amBYuqFOMlU8>K z>V=m6`#|O3IK-!h67EO0(r$_#s52cEvxKXFX`-E$@^x86)l1?a7P6~(z!iC)S5r}- zVo6SEP-i{zK0aaL3++44Ohe%JeMBY;RIOu$3mde$6H#}}GxA3vK`buZuX)BUw7*?> z%yBVf#AQg8Ea?cHc^}IXDxYaCo?++H%DHFtq(Tc)XZSBYR1OC?;0vU5hmTa4?tK7R zLg4Q${tF!uzH7(~4)QxH`pn<&biNcaDjMD$+Q*-HWtD*30ry@#?c82R%YD>@Ymg{2 zjtB!3VG%80$O|PWWv@~+1wVSiFV+18yCpGvoc)<0Lddf8Q(q{SWS<0ruZe`}dAA1> zk~UYoDFiXr6dP@qHxH56>6SU=-gsZGiz0wGE5K*Tqu;x}nz9-GG@Twi{YDn69c@Hi zlBWeZTl&fQw_IELe;()FFE^^kD25kGwi_vh+mjs!%mwBBAWjJqJWRo%<->RmmDsKy zZqtv)nJ1~SEJ4JxYa;Pv2OGZE+y6&BAwJE}pI06P`<>a( z5FPR|PFVhSF#^F{JJ<)IFt@SZ-5;R_Cwa}*e4O=+q@Spf?f6V+22fF30|GMSz2;>7t3P@ZcgxdbwJ62av|nbeMy zpw{@aTeSfLf+yl;#{Pn7Iux8}HE>s72zR^C7|%1=9O7<#!og0!EYGi!bVe5l=e$!v zp$AVYE=+X@q9Q`$iIKRcU@S)+T8_ewePU)tPw0~45~N?4-beexAj1T)pxOz@2?!51q{KT28EW#PZQk-jE`$0M z?gyz=zI_|X*oE;k_T_=;GX~mMDAzy{k>RzqxlmNKaluf5u%`E!<)spU(I%j!iAJ!@ z_1+xI|Am{)NK=*>O5>^)K2Kdw^$D_fb3!NE?@v4=M!W&VnEE3X=KkVmt!_fUv;cN# zNSp>R)zDD zW&84qmpwyDE+@P9+u<0T>{1rmq%{YMsIe;2fLkCe3l9jZqwN=@(`TWT&K&KADxV)( zrBL!D1|cr|TdVQ6RHT*o!7=m{ez!DKint!F_;%diuAG$7;C_%OK8JvU$u~5M%1Xj< zR>Scsw>4cN;KXdFn7!cjZ1gf5CE~xo_1&-p6n8+Z8GWw&ffuD9?YxjleSn6Unlp%h zFKv8+{CaRE@7FDN$=}bbp#au|Xb_}A*lms<y-f(E0Dl&BE@LYq$DfglL1!i$K4I}qYC!c2)sb?rI)YDvShBUEHfp>f8h}_n^ z)EvM4L!Pfv`kBroQG8^5Hoa~2q^yXD$Hj3??QhRJGu`!Y z0wroQo*T=clvt697qLNMH<1t-)Lhsh!3TEmNRaKeLR{E5{2kO0b85f-CCRuBNIa5c zM1VXLSZp6OH)z>~3#*$lW4I+k&^7UU4Y41H2 
zFO+~qMZJfa50c3~Madrifk?@3ukAZSFI9GDC}rDu0*VvFz;fe-^-$I~iV>v97z4qS z`9$C}(MNS)d=}p0PuQ)FItV!`~(0;Y|ZcD_Z@)9Ov*CCAA6uSKd z_;gm)|HHB&dV8`>unJ|*Kf0a;9c?J$BPo|_b_FDFVxHeE{RnbsmVekLzklrKfJa}0 zUa96UkGk)6)Kr zXyuQLI?%bK0GIkPUGWk8Ka|QLv0M$s8D^nE)1?P|&U?OS5N~#dF1YnNlTA8uIQN{~_Mx z?$ac$CsE(?!?Fm(m@PeHJL|1LIz(=FGegJVbJdn7ZM!`L&yBCg`U1%Dxa}dbpgfZL z{`BEmYwPW|^rt#*H}?w*T_$SoN5>y={B}uG3rgu=Bt$_$Nptr4|18c$YdrNB$JKVI zrN|B?f zz3KaRtmB1={+xXom~`z$GWBvc$}Ey|Q5moA?x>AOAbh*(4CR()E1-+@pj^i`zL_JXK{zt0k|JdqnNat@$~0 zDyttASI=F)T|9_w^eF=l*iIw}rkq?r)R^mJocVoX=J^hV0Gg)r$G(26B@gxM zqz!QS(INnbe6=H$Z^h?aR9&VY_!ExaYR6$(QW=vj-w2gEK)x_?!zn=N@C~-xeM179 z^AN=N(=(E_?HQqGaph|m^?7t0mvi+ti|&VGqBFaHp(KR*PNOlClv4kR3bF`|G;cq@ zWyjiD$?+&S$bKndkgu8!-^KaDu|&4*9lTy` z0T%-VSpzQND^9=go+jM4&y~)Bt+a1_tsTc9Z2~GUI?iuUP(}?_=_JVQ)AKH#RWPQk z3Hj~s7yRk(Kb43qC^yw_KlNj4kh<6D1POL;yf}WsCsZp`Gt+#3S^HVmAuK~33hk^j zU-B1LE&GBT(hX3GHbGwh(EENXUENSx&I82aD|Yc| zPyPY@5}+L3pBv#9X_^a(86F(}7=tUqj(-e^85pFR_=>CZrRZ?|&XtjYz5OPJp){9I zL89FTAF5?f#-~j%#Nr!j-?i@`!9{^u5uA+v;6?d&xPZ8G0pupV{Nar0mCXfIlf@@U zV@No~KKv^RK&44<)*_O?5Cu45e=F@5epuD!eH<-U%#>MJ@?9RU-elFB&Ym$fDer2m zcY5{G0pBqe$Gs-w*~yC=_xVaX_gD@W`FT=HO*pn+F%?+GBdEapZ9@I!ET4Xn4Q|dB zzCy`MNyyfc9czDh=#yB!Ntswq0A(p@ptQd>BmK?~!SnFye@I3dNDBa;Vt)`S9w7}t zA?{}y0FlAyioozween537y_H0z$pU3T==J~Vtq{iFw=kQE-hEGL&&~Q&5is~Aa{IVgGv|*^Qsrmp zpl0c4h-XeiuyqoHNtj6Ietp*$qXo_b+6UbBK z@}YbIs+&Rg4xK}?cxv>*-;Iy-r`9l0;^mZ2YtMJTnA?8XyL?%|5ygzTXg7cD9|nFA z=(`xk{>%7~{FHK0QdOIbUg4H(@#`ESOOXK1r$cN@gE5@u;4zw? 
z0#M`uhcQVp>{DgQJ11YN;`?3tKm9II>E*J5VG1(4O|XT7KfI{Lrp$V7%D0APR|wJ)9Zy?Csox%-A)a%ko&uuwMjWoLrhk4v z@|L!MT__!Ba`jX>QbG_qqOZY_sNznWp+tf-hb+|8% zM*6rXvk|7L>>qB=yGtKS&F%+y7)^=nySY%qcJi#ecctCfxlG-{hbL~n(?F?HR3B6V zk52~*y|V3B7jvf9^>x({$*GY zeAA8-(q0f6#R($LYuCd5hXborzYOcXGkic7JCHZ9HaK~;Y+C~-Kv=3@4;tif1iCJ=K#z3Pa9VYO9FWLtvUm1`6Oq$3qse8; zV2~;-aQkhFs^G1+J z?+hWoOGuI$;HODSO+|fHV?W2*MG5KsH9OQe*v_dEs$Jc2Hadk$tN*wU*+HqDCrJnz zzqsBVOgb%o%cjq8R^|JWe#cmvO5a-c?Ckd>_gM|I#L8nA4+7>8cCEXZJ))45z(AYp zv3g3=6F&D~dr5k#nRe>FLIHrgnlT3?hq}Fq#MV2E`M*}`uLaHN`v1)KRM`vmDHx1#7&P{pH^C)+3Ggje zt0K^5VnW0@LF&M}D7Sd_MuR@%}Ffm2whmqx!U&5KGXO;aGPt{GUm; zP(ooMw-d33m@={@m>t7gW$(d$4=I2V5}5%x3;P@sU!@bk@EnV4-jZj=Z}{x!iz3a6 zu|@3+$6W*;Md7SlSRcy8B>c%}FYZV}wJZ(l?zr`6=ZkO^-5jpHB>3`;03430`FpiVS`7UYI6&Y%|irrJ@ zCnqA-%zfqM;);{f9$aW&?dVra(GS-bDElacaLm~m)~yx!)uAt*M}SO(ZFs)nA#$afE@pow zAI6eZ4{>BY8FIEZp!IRhq0{MAy3cKFlBZ)%1}~(rYt+TZ!+)*{IuCCO?P&RHxzvMH zK??l7Vkg^Z&xOwg!Bv1Kh?2n2oWRy%|M9Lw2w__^1^dkc=tq$+WdfuCF@W8)K{f+7 zjDO21U+&dc&s67hl(mGIw*`%{Xd6%k2!Z$3TS*0)Y7GOYxgB$WSeQZ2f`!vAk*5(5#$Vr2tO#(Gz?$kB`Stf`c!8qMIMgqalFoE&J04#d-k?> z#7@SJTu!)pTs8kAc+kDqunXe zyBI*WfRs8AgO1YjqG6$*hXTO#?Nj5{QOe|11oc4BMpCwQ(cb|Yrsn3^cB3K0Z%22M zunXH&({D}N=7;$b7|vNyiC*Hhrt&79P~X{P6=AJ{!ibDW-f}#DlNHVRedryRaXMWd zpPn%Y_UMnHcA}G(#Sl7G7Y#pgR)L)>K&qurqVP6JC0}Tj2lG7<4koB2;n%o0b{MH7it0JJ$kCuql-9C9^C5 zgI`6_{D!vxFPO-N6p)dfB$OD07=V>UwQ_Bi&rBT69CTCYf}HUrW``Mff#e`YzbR=K zZovHip=IO45dSmvz(RV2i8XVgvhrq>yHctKRpXqo+l>7eBCUVQVG(={CSUqAP^1K+ zka0klv~%CP&E+nDKcx-EX$S%UbRTSj3IMwGm{Uzr6;p+4t$|62X;In(4eu<01VqK) z0hRe8C@wR-sM?rGKeNECyF|JB8{u8|R1XM-U+%F6A&W_mBtDvcsy9y{24dE@ol$5f zS1`|HuENOJjGZAcq3Cau>WdCxLuQ)tYCk~I{}2#%<1u#xwA@Uni~ zRCipQbb3C%;AY;zcN~UX#2(pEG%nXtOvD^g`(zz-s+w93I%USkQCddzS(tOm-#QyY zShQcKhg~3dJo*bzep;U)RC&Fi6^h&Z9ei_lm=v_gr&vGFB>VrfLIaY_o%c{?$+IC_R z*s2}Pe6GJ2dmS!(==tF34QB6QjDhZzlQu(7@6bJUV0zM9%_L-3EW1euN%&OKmA#RRm9{^yqStfJJkfuwcH3_ z`7ou4oCu*8HjFKK(O0i1HnI;jCfqpFA~EF>xsPnh9(tnbFI8KW)V-o|q&5B)d{i!h zwLGIX5Gf>Q}ouNICVri1pr)$y;nsQ#jPy(tHO_v+Q=9%}92tJ8*np 
zudm61SThEbb_`o==kzRvgFqoy@ARKw1F)BN6aqO}ce3O3Ftc)lEJ6 z#d;OD21{pRF7$Y{`w>r(bluN#uAPkdFCF7*gIv!~@`wxDJ(~5&#(9B3{kxx z=?G8|vIYqJ?!Abo>m1*Tm$hoF;4~Ws?x0+w-Fpw-zfZpA(?U!?xA4$(?7|7{XDLU1 z?Dkv4h&;3v<$^r>?Z=10|7wAr1Ex-o!Z=hlNMYCMlaYDaZU?tyxz;U>n$* zU@^MZjM>5*5kE|399(zXHtHXe7py+dN?8RreJIE!UfbH$?DC16Q@(eK?WLV1wZJ~I z(|FuOXY&GzQ1nf*7Ad+jBmmLVawY6OJG^#4RYNsW+NyN2*;w@v=8!o}wunvQIIaX2 zN#>B^edUf4*=WoY!!oV2T;4V-QQAg4Fm>VX9Txo-5ma=gOr2BySpdgN$h3M2KTbxM z*PQgL`~Ci5j!S9ZKaHAv=>#$%FK7B2VR-1upQ%(OEn-kV0kt8 zaCEmZ79t|gm%=6Pys)>O^`khupukX;*eBI#0N`^Olf}UP~(k6LnF;=7wd-iwf zp^P;QWIx;(1YRK;0sfTlGelRBenBxy3F9lvZpwG(>wG~aU2%9Iv=Z#AG(psVXLq@!g9OM8EusN z^&l0sKgKluNC=yDEqP{rV8{mhX6$)D;gC;B5~j*^swWaEXx*n#8bGqMybAv6?~l6x zD?mv4q!)}=94m`N$DeCtb?$V)@mfOcS|zc-XfM z;}-RgP4-#t#|^(syv*;7`~!hokvS=YkKg185g?hfe=I1HMHf`s)M;*9w;1 z%^F8T2*}-Xuj^8?!nq{8UrfmPjvGerfHUZBF2ViF5}hu-Zvo|7_fun89srDk6`|g& zW3Sg!k3ehqbp%w5Es%Q3H8Tlwb!OUJCr3jHY?rN1?s&8^31I+3aIcY^mA|ZbALU)E#*quCv_NUR3LRL{8EA##C z@`IxZi|+ehDT53&fot$G5i5Uol%n>hqIJXT?`OVn z$3#a=(MFN_4k$T2KiVYi*ORBMc%zT+xYue+KdZ=-40W{HxZ?`DerW-=AkHFfP#>ob zEMbP~0~sUNV_xz2?kiQ@&i5Du!uB0D6pl6&6rWsod*FELOKMM9R24=&O)qL3)yWFZ zN`K>xOlij;c9JPL+1e7@&Yl6f z`RaWp`Cwdu1by9Ar~pmj9RLj(&BU6|0fwOA_?Wy<2zHtOoP7dAw}ruju{|n(h(L( zdQ4sq58S-_bJ_Je(;m!)2^;8*SW$R~j_;RIGvBk$GV>E#dz0yn6W8kmwPV(L2k5Ce z5hF}+3yNYO?24Mv@N9$2@h4v|#=MSWn>^2tR>n(W5w4$0Q*(=MNmf2An<~imX3b93 zm3U`BU7U|#sw%r&0ozWpU2RuxXCYdHdX*iT5=D|Tr%&sH6Pq0_%Az>?(b#wRmY?-c z%fCsu_G5sunPyQ!m3=>-dDQ*i{>LHyp6kDJ1~cKD1g$=P2EO}4#; zVfKwwI}87`mJ`$N+gA7)X;84JTk}Ih_QtsVj3p$Uh{w15xnWs?ou35wxhI_X9w#SZ zJvUSrjqczogcmRQ;BJvDfrvq6b0Mbc@!P3@L4v;M%kU3N;@=AiUgzwAO@O8`B=F}x zs(CRb!FK!Cc`Y)YZNm+7bKgxrWX?|?1Nm^Ml`w-2)Ky_)KFsSrtOt47d%5Fa0bH_p z`snw20X>$Rwx=e#WxO}@g{aOYu80TS5yiMaOF^b&p&M_HX?+MS==ZMgE5WvWlu(g$>zgiZ zcRnapX4z}V<^rCEfv@x)sB{z{k>dzT$Jh_)m{FHu3>Q+NA$19g@&RKE=i$%yJ8a6n zom*e($OYw#^=c@KIB&<9#bC${_Wb_Dz3R0>-7+zA#uSdWvF#~1<#b)NpxqzjGT5`c=khvpM zNhKX~bL4MKe5K5(S z@Pq{(04Js4QhuSN1vhddFVgTD@O6a*tbp?T);d3 
z1B_ajMrjx;&|x;-98QpbP_PpvTwtbf_6vSC!K|uKp_rnHf+mO4LAu1P&R7_2)M3+0qD zWiHKDlA=|Ak7B~qGZBf^Gt2PIri1KO0fvGY6ncV9r#1dYQHrcBiaDv=+H^;AH3mTo zgYD7Op2#AmbMeFao!+y+CXJINk@ZL|UuN6M2hzomp5YM{UoSvuAGcNyU4=I&78aAi za@}YW_b}YGThH}DHlwovc`EM!3xL3p@kV5mYE7JiV9US>#5SN5dm;NtWqy3%h}iH3 zn``kt;s=i)%zIha^;L<*vb8I?tR|pPlObIl;=j4;@|p4Hl(@@(M=R|04)sm0+W2Ji z{B12!qcuiFgF9TxrYrTflxXcC5|1*@imba|wrUq4u9XPeuzn zMj*moyt8I&Nb$-L;4ln;z466>{i%Us>m5dy={X&tFd8Q+wUf(s^bO_0sJpvaTV>ky z`=1VKhR^A}xi{KCMn9rvZvm1RE)vnRoZSo3lxebogkm`}PebYX^)ycI9G+GZD3L2M z)PI8cj5(VW&hel{<9m9m>ru>bDAL%44OV(MVQ$`jtSW0K4+(IYmgV;{>do4%+?+s3c%6U<-n2xc)TG1;#*Qzx=S-kkoX@SyeZlMlKS5_49w(xU zS3&bjYh`rQ`G6ecp7sF%sX`w!uCMo$DYKY;H)FWFC6ri0xY7&Qq_P_cSWD^a8rc!S zA6-i{1f0K}x*7SYH2iAWOjVv}RhNg?>kz2l~C+!0EX}K|p-9qWlD4eyH z+GqbRX+V!7f)5;**$^3&U=s}=$)@n83=bW+$lm$^pPdS#uh}W_?}KTFTB{d6M?*|U z^pi5j1HIcQOFQgXxs9KqzrZUm>f5J*?0k+j3Y}5o=9}fH#kalM-LK^{#Xj{5&u@2) z&K*wRd$Agjl$&nnG;xLq_&M{T?(rNn1rSToc(KRy=}IyxpIq3OsBpqz*`x zZJo-gvTgcjMc$`@dDT{R|K`*;!Y(gh+w$dA^CkQQLGaq!pUfpT9Qg6-cLp`Wp5qu!B$y| z9CL(YfBV!-J43i`QtWr{@~Vx02qKQYa4ik%o6E5;|Fh*rL91e5z_xVd*6p@NL9f-e zGX$$3_`Yzkp@ID9igH~8QGOqP_*{lZK{*w$H6q%^taU2IDa zHhyuz8yFfNW4egnEunzQL^;{=|T1wHn6|!Xfw%hld+?|bk>1R&eB_aBMz_nw?oL8BwP zUQ+*L6nRB7pM0Qkobtmt{v2UB6U}9A2*VwT=%#Dp&1W3&UDY1!&%cU&TpL9sz#rXH z?%Z?^aeNQ19GOx*#+B=bi#$ZexeMd(bKH^>Xlv`WXVyQI-{`@PIpdq;aIjls7u3@B zO?$)fwKO$n^OV34FS+&?tpLWJtiv4cMtQUnYR)Rnx_`B3y%2VK`HD9hki!7kTvr9B zeOYe!s8^hTuM)1ue^v96qgj;$iATkqi*FJmm8Wvrcr0K+!aY8NnF9eG-%}~C(ZD0v z)s(SoU{q~Sa(e=36UZ9mAyA*@S@O~3RD4r&0FQe}J;Xi*t2ahtTEPi{1V*uDt9q zbw#d@%Y4LPEZPbebYFd1j0(gR78YJOs@o^lf;l*jxb9oz4h|d9`qts}mp#-VbwB`O z)dE2F&xu91ifsx{bDO>^b@OP)>Q*kN$XLF+t^#C`_2}s~#|FpSCsQ~(wR3dXf$3{~ zPC?0Xh8x8xtuY!H%)ZII>Xulr>4C7Ry(`Kz>P&)tnz4iuo1?MHN#DIkhShXm@uYL7 z*BaCX&W=i0#((W!!vkl|rw_k_*;cDll3P50@(0~Hq#uYwkVx~TUl0K-cXF{Qm&=O> zX%(BlSX)}X>>IS~B@>%kv9m8<-;4*^BuZ9O37<@0LA?`P>&jUrL(-b3bCWzPLzXG7e&l6^X++JJPF}w83Hs0+q+(8&YC9d^&CB0`Iw5z3i0P zr#sgVF}x`$p`J0iBo^E^7dyCMUv9iT6ci*aMUE3LYF*$)Hic*cy$x?kv9w}~2a0XV 
zu!5BtVJIg&-x@~R^ABhcO0rK}|NPj;)=IVX1(#cttr=`hR)}DPFs%zzUpYH3IQTg} z1gj=Iry-I%Kwcr0m6vVC_^l(NA9=HVL)6U-YZ4pC z5(DgNtC+fU#hyK;oqc-Ato#8*Ylcv$$iZ$90dSFYHZ&0S?15Rm0JThlCM<#N9@!k3 zztoYR`ro`1rm2wF-?r9SO9y`1u6wnZ!{8mrteAe%VbGOx`_EER|+y8aj}bFK%9Ck7yK@@4FCSt`eeFtq|ZmFV0kpsM#jFo*HEUzmg79sy9A zM42bKo=`o%qKuLNv59PuHMahMK=aIXGe|~`POn& zK@z-y)y31Nk-LOb*+zdcnU7XPZY%A)k-Bv2xH-nCZiwKP<8$b?`KWCYx<#9w{s@dZF$Di1Vne`(XlPajRbEueZKdF-!3Dr-6ieEw zISEO%+=`Q{)jeOG>}$Tyb15AN?%Bd$Dxt4mUL4WMrkNOkfjd_bA*;WGZ~4{jvaP^%vw#S;@1F8$)brSOu>FI832@78WnSWPdaW>hdAE{_+UG`KNAM#cS!p!yO6+s7Z7WHVlf znc0jPJ~~|;*y;atK>!d4K|UJ1jWx|-wY@mUXXE;&{OzX z`CVKC+a*6Dz(9~hPI%Jb#E36N@)C&9xclPK92hjvgp0vn6VP2=5M(U<8ffA}E})7A zn!t=)6p%A(EAXD8n%)z}3B#~R5B#u5JH1X{uT{tE3JrT@9u0dg+Ln8^xbo3+nH!us z;XI?ITkKXeODMoNJCq;pP?2hsq;k7IFJ@n%;olVsI=*nWJ)G15weYgGSN4(m?31Cn zat}=&hL|)};`7Fc4YKXl!{e?+pm=JgNh7%U5>H^c`T&xC=hZkW*hHFf)NV1Ur{=ET48#1P9n`|yb7K&Jt zB#=lhhE<6^Bs-O*d49K}wLpIlwtYY~WwGY{K-NNX!tiCJv^#8$zv7nONftU^^stN^rd^fkAm%Z{dFse|o zHKqL(q1`LTh+!wkS*Nx-CtvC}d>7l@@L5JFgv+gu4ox*17%Wj8*XG3&R$71e{1w z_4-Tsr5l{UE)#!dni4rC^GivCf>N}I#&N!Z1cp{ErcPX_{z}I83HK}vKkS&k3leA` zmNg@7jTzo##(`v+@0ozDxc{B*=Z7H6>ZgoW-~8QKyTIM#`duJ4$V2dubwJI^Omw?x zd!WF<+MS(wHwC^;n160O6N#D2YdE$LZhv5ZWS?|<3&U+N8(2zHUS&2@W=3`6NqjxO zV=hZ^Ii6RjtyZ(lgnsXb9ozy3vkU~LHUp0hOu11+KD0N)*JF68Ho4mg4dKVMaOzwM zjEA5m``SI++iJ!7F^qKa7{x!cOOb!;!N+rCd0beq4~fKAcw18rqT=@k6u(Sd-l*Cf z%+Q}0=^D$@BeB-ELFzqG9|>K94CVZ`FUKfK#}rTOTD5^ZE<->JRJJ07b@ zmL&bb{$SYoeL&XCAptO(+xI%^E(%JHhuc#T1j=x|ADpMo)ch>p(%w#?{`^USK}Y3N zR{^>9r0`WOu`?;8oEB1{THC~6Wk#xrPq6z;Ey`qyFPFs=cqbqC$$U> zw8b=h|H1`->3mgF=2T5sG=B|0gtS&luj)U3iD<3VvN|~MX%ibD)Tfg z=CuzUUT}*O6 zMw4rHp;cE%(kL&*qSJ9ZWR5*`{+}vxq_Id0f&~zE?eptW^iv)R;<31nH}hO&tHL)X z<61E{LR)4z@IpQuuI@wZui^?;jsMlgUKxB3BYX5M?(*7zo^Iif$JH4`nd7c!IlEpZ z8($hJCBv)CP!EEa#n>cfHpkJyGMC7OM3k^=YxsR_L7Cx8!QLOHpSyFhU#IPE7h<)} z&vcF+nt3;A(no%$)+Ct+$8W@Bqr=w(h|u=J$AXC?LC2UVM+Ggt`;3=rK6tE;TRMKV z|Bz+qvvrOaio>Of7nx|&i1*%nLYtRuVT)ILAaN-#|8$q?_J|Tj`J96gD!GZvRSo=N 
zGr&qS8FF1SB9#C7`CP1|qLY);3EyYWp|IbJ{k`QNJqho2>Zhm){CO(Iv0&e1TeEzB zMZaV6kLw%zqe7l%`-Vt<#V@_zp!Tna_v^jK6r7WG5ah%1YINjItS63Njl$cS^O0&G zNL&&wZ|guN{=H0yO1+$p@*F}B{4VQ&nHmO(iKqALng+#IEYF&H^C?YWIcX^T786(& z=Za!*m;Z(Z(6N%PcjeV43Rf8SZ67iTn(1gjS|-;9{wvvMIOFp41SGLd^#AMwQPwXC z85{(t6$wk0`~TAI&vd2v!*EP~b_{+rhREn6q>P<&YI9ziRzvP(Q-JD_)`vmd?b&Yz z+QH_sXjMrG`x@s|F79&sS3!U7%cY(+FRPgL=~{nw34B&LzMj5Mcm2iOO$S%|!XflV zj4Wh%lv2`+;g+sg7|&_qvB{C z3>s$HUu>6lU+}UK$^}9rYW6S)$#E&ivcYKiaHw9};Buq`WrySdmN4yO5W*%F2JJh} zfB!sW-}4Fa8QYCvN9GWqAy#%Z(kFuL;jRBOZh;q!B&tZpFCyp7vc&gTJBcwBopCw# z&JQxbRS<2|$KUb7)E<6ets`4@+GwCBPxSn`7mRf0zq{dkbo}}d-@~wa@Y+1YiQczw z$H;?jV`m@3WA%(6Wt^y*b;$%>)Dwu8>dw{{ME*C_G%lcCu?oDm|BF;6YnvzF92QGXXD(llOgGn-hJz>OD z_=<1zouah?*cQ?TT`N~76LdC#>}~$HlE2nyPVX$Qb-jx9QDcLn_k+exVw#X75#%9A z1ol&NeyO)11LX7sA4o({CuJZR7^C=j@7mrg}y?^x_0D zRELZ2SAQbgB+bayRyRA<#O%kn9_qhBG$|8&K5sIuUQ?6@w}iZl z2znfft!R%kaSn2>&G`g9WsCq|1jf;BEL7hG_?LldDzF?127rc}o1n3X`!4`8(ofbi z{?$+Fml+yLWmmKp?m%T*(~h>iZ(d2O*%hULXMTOtIUCUY8JTEth5}5DOCj4z*iI$f1Z@M3uGZSlW zNi=kA4vk4K2H6Oj_IBt?SBnye>yt9P)1PN9V6QWaVYO>?qK1KghVM9RaU0EkXx$DuUPEJv7gKH zQ?l2ZEihX{u0u8})$L-NDheRlfRWdEVR4w2+-!mT9nVLO25zAB%L8*P^IeMr;o@Nw zVsU`ev6k0* z;5-7gPm<%GKH^u8pI(4O81m2_JmzIk-I4V$u;Lf2tkB2esC2)wmcPN+cd1 z_($t#AxupPCrCoNhJVff#C!P$k15AY66vUIw@6kduGb5h7kXt6TjB(G=7spKZYRFo z?M8oTzOFh>s8dFr`KXU9zE3@`LIhy-;Dz;HD*46qwh#M5_;q?Hs zw@CZF9l#W5;l_4^x=#<*Ho5T(UNnQ}23)C9)s&^k1st?aYzhg#95Uy#XUYD`;8y76 z{L$p2t5eYLJ|LUpMx7RgY-UbiPIPk+S7g#Tg3TQ_hCkh&*9MPt?kc@sLXyce;y=hB zrISLD+>J6v5NDl4xOd+hG9{rr9k~fzjCOR<1Lv}Jb?JxtacH4sPm+avz4BL*H z1lVyH$xGv6qMqJHWW6h{eesUZ`1h9Ka}B z>28(ZSME00I{**Z&~~?>14aU`L8JV}5U$m|n_*g2+n{mNgl(r{`1~Ig!%_0mzEa?Y zJU$V(+cLhIC%tUgh-)~&*#2a=G?naj?cu2*#fz^)BpR5?qsQFew#_HUC2r;~T#QWo z`@I-)4Lu%J?{Cq7TUR&Hy%&%63wv_(#YO``+g>{-vXA_BJeFdt+r7uW+N>3-=Imda z-X~uz7(q!|9;ZO?|2fYv?{q!J-gbzfy%xb@=|STP_PU zt50nXF?jUng|o}wmewB8i)JW(tq@K@eK$UsIqEF&KUa~^@~h%3*mVu3cSaN|p9B07 zXPLEm-w90@^ECOH(L~#DzbY2Jr=3{S&aIOrV081Dk$Hc1Dh`pKBjbW!Y7)P5?4Ivv 
zmBPKfj`KFqcN&x%SiCZQwqYm{Gd0#x`Hb>*L#X7Jjj3J2NrcNWuhQp@fAc@SUK-WJ zXUTXRN_>bib(puHmiJtA*YvMdr?Gg#UoB=H>NSOTGaaRCkTj+-6Y-6)^J&=cbXF3t1Nk7?l^Lqq)YfCVd zgGLBfhDGY>gsxGgK>Z{77o$0{&9tcISK>$P(Kuv;fsPwFaW<|-^e_^D@duS-^6_TS z+#IgBUXQ{1^0yJNVPt6cRPVB<*MSE}M^yJ&#}KvLT)0lY@fmr|^VgZ(S?-4XJRDG( z1~a58iy~UYcXzK4#Pm@cXUS#SSCuk-( zj?SV#lgJ#UKZT6V|MM}9_{SrC0itsBd=`pqKdXXMB_AX(UV7$<3 z%jHMqYdik8_i3D0_qirVJ<6&_Vzd#=OHlPspt?68xmk)A(xQXW-Q3>4sHr<#gxw+8 z?hZ9}{<(rgZq3Ewj}WC{1coYHeMJs&o9oL0`q-5eW4W(Er5{B!kJGFFKFJPd{AB=f z0j6X6wihNqofyM!UhMdEN@UC9dw4uhjtW6Yeat%}L=(ojrqK~Jqk1|K!Q^-BM0N|e9qW$4ib~k}KkPBa?LqYBf4vYj zj8S}i&`#vRuiyh8vQ&hKLOt=uf!yg+G6)`SPiGd9ImtkCF9@R|jdiEzYb2PxE?fV0 zmzem9z)pWTM(j?W&W$-G6d_qA4_+hG1EAeqB1k~z6rZ-^xNy#Ge6?NP?g@mn)W!L( z5xQNofRbj>X&@WYaXpw{~iSkYIZ$gTZdkqb_-lvn}JU)hzu<@h}g<- z1cnE8f9)#uy{v}$v1Ul6G>d>C(^0(<_$Mp4Gl+rjSy(TlA7y?-bE*W7+O`WNCvSEP z`V0-}rrk_(ypR&`CVbzW6Dz(UtTK9RcP4RCPDKCUq%d~&I^C;cr>jbpr;t!(Xa|}} zX-{z=j)SM>8@Ngin-tk6y}8z@j)b)W(NRzI`+<}BKv=kTMdP?bPfyR(0#HWNrRx2# z%60HH&a6!DR7ay}Z@(>UUW&Z6qwPG#4$lPr#>+askJ*oY#C9to9j_9Epx<6i<3l17lQ z2pbw2=6V=ib}a^n#4*H$>pRT_JF=?{2`9BCsJj@J|k zV^#yY#t!{Ksr?`Hheu)WzO3XDYB|H&;?-7`L5?>Dyr`017TB=pIzjHE@6CSVTJ-ha zH)mmd!K<~f_vyoPS3z~-49?C?plo`cF(IIz^}&sUsT%`Hl&5{VZF4e5P3_CTYssV6D&G)7f`p+`}Ht=Y_W!Ex2Hd+J#_Yi5toCd?$b-fgdc;LnRBx;SEv>t_~g9oXiXA5 zmBU?-?__N0A*zma`_6qBKKwXUIb+ic`B== ziGf+^yTOZ9G0(y8cIw>T!71ZLp`jDqCiIiHn$OXeY-UMsn$^fQ9l4OQZZ303hmILz zHgKaSPenG9lW^idMn1+tbDXlOmE9{WC$;w4<+s&2MU>csEVIDQprc{2G;=%@^SS&4 zIG3?578th?M9>GEklEsE@2}6}SQe-u*xjYfMC59_rOy3&ij2AY;0(+l2FA*J91K1H zCFpPh>BsDnA90P3=c>@#hUNE^Y7&GfX^O?>*6kuakGpMyt}fbpdwr;h!`JVryNvHp z-HPAf#@1T$Lw9`GkYw={BZmk`9&>pG!NUqKN6P*Hdd7;`oz1szw(U#4;)-Cq8q%1a zQQ%ZRIJwoW^)4C=Z~qKt0cHm=-hI05sbBxiz8k)Ly1e3KuxxncR~xlQx7L5?GrFer znTcZIkh>(2!buq3|71S5M5=|EhKIQHErRO|;sR&+wy?I*eI3sP>bO)-%!*mx*d3`d zJxt7dd)swz${PdUiKCBc;;(3B)UBRa*c3ArNWH^B>`dNT1;>=~J8M#1v|PloNt_fRQTTiqwAXp`jj3%{u6s}?ZEz^Gr$f>R=+c}NV{ zlSSuV8jz0Oz)^tGNlBRZaS>E)ue-mm;PEH+$az3?JEH-&-0CR3+sYp>r&Cvx9P?9Y 
zvx9pX)ZK*SB#c)(OTh-as-*Gnq=$er?CpEcR>9Iv9;vaivttv`5rm6)+6 zj(D-_zcw7#RG_YZls-CJpipdKLDcX$Zu^P*kf^0Hp8b{=4Mv!6maB+w1Eb)`xho)+yF zuX=BmD@<&>n9}^Zj6M+0(L^Zp_jfrJm=OB%Y@+$yuvd@iX+IycEMNF2^ZJ3K{^AX3DG3$${{g_uKYTV+-qLg&@}Wi%UOe&MS~cLid?A288%U{HVI?ih%4BS%v(GF|fBID7 z4zgH2GtLHo=KRBRpY69IG#z%C$Hga3X=5LDUoa0Va1w@g9e3p7$=B ze$4JELF81ruG}7@vC5UkLG!Dd1HQ{7<98ro?)ARx%JjzGbIKu%uKCk}1yMc+P+%xp z&M#-Fz!Zv`Cp(!somD0rGG!1zi%K+SZ~OF;azY}MxA}~|RHa-BxuE6REi^Wikdn6P zdA>@RPe=>@3WzPkw~uaZrW(I6wth(Vh!$6uYRN(@ZGxG% z0kW$1|N6sgS!NX~4Sw2Fvp&ObDuZ0XYdpOzu6X=#?)L?8!s7)X)w~i`5H!2lV_j0} z0@cSm;{5yN1u$@nLy9lPE^+zy76qZXmo@dxZa)uR|G@*ukiP73T#rhAoOJnUz0eyy zwT=~MCQ{2Y({fdzP$gD)ojTNosOADY_!vSd!pcypZjDz09TAW*oUW0^d+o( zV^br1^jQ?E=-W&WjHAHNIf(&ofP=L8+l(Pj4>_)`i26C<=s$J5(&;Q%eii9U=}Kvm zLE3-lawbL8NFI6-wBfp@pM2xniV3U8>>SHUIQK9rP^#%%1PKXR?X0rmrK06bXl15^ z8bG7X4?ap_JvVf3b<7-M$1rp3@q>>=U1Qyee5?mPrYTpfz-015oOK=A)!>tb?KY6S zpV>1h*K`gSYm$maLLU4P2%ys`QdeRk6qbV?F@>Lk{~@j9rpnk z-w}3^841^%l1}=wTh2VwS_G_WbLywEhE1*IgIqIkJf$L+CKF(@%JA=789w|u+gu(Y z*CJ+uNFs$UyZ^SXz9|n9JK%gzF)yPG$nUz}`4C$VpwqqYKi|I=jHH+T=q#Ad=^c5Y z7(&sFzruomnBE&B?1&*@$;N#17@(sehHaG~QFUV$8PpptH0v)0*0TQ6F}ma`x&<0;q1hG2bRy!`GpW;tf&@uInL+h`_Ssik>u@-t4qu9=fo6IeDM}=`Chj=>VSjVqqROD z_I9pmzN^0>m((j@AVYTMG>QN$e%gmTcAR_Y_=m{^@ie)Gjszv1-peH+*vaA`21~Tk z`_kJVOJf*P*(~1xq-*pBv0(=aY!KDrijrdyp1)4_6J{ePwU=I5Nco5fJ2A5hbLl3Y zW-dUK`03$_f5PJIu{{T<@ejzlX(;a+G|e7rAL(wbrf(kZwM<_6Y+Bre8^z?ABZKwiSIMWkG7}u_=Z-D)4Q-uly$VEEecGdHOzccJA+vJH zpFUR^oarW4%As{`iesa_gxSD!LsfoBMU>I*24YBOY8*x(n|`jmvnl~|Dx_|ibb5Vz z>&G_tA#BmA^4qx?w;zMDIKHh2GBpK%R8soRCU~660L4mV=`y4=r2K%akP^?fx0J4d z$5>|l+zwGOY6Yg7Z%*mIwm0atSS=KsELLPBZUg&Ry^6CXQA%fEYoEp5bZNnlm>E#| zoKJQbDAlDpp^;K6jkXgOOv~9YqEcd`WmR)0Z9}LEqL%)7v$bw0J>9@F1_2R?o>@Y8 znhQ3p90g>eQ+T!eId6%<0+(bl1(NRQX3@H1Y`h#D76oTmF<;%Q@G4kN&`l0qqUCN} zyM~<{rflB)K2WxEoG;a=)2IR*ue$_7dIz#LTFPq%54RW7{~;wTw&m{q9yKK6w1fsZ zUlgKX!^6GCox+)LSmItnzZdiccfk}-6G16K&E334bs@}#BUaZ9RO#wI#;Ci{={Cvk z0aHDZw~LYeV7{d;Su49o??$sUZF&*UFwO?oM##&9ecPYwJ_B|}skqj(mTrnkf^IGN 
zB~K|z_5xQkriI9-e8;fg6)_8#*2cBqHC!a_ zWCS~v@M%mcP-&wO?}|-oPL_qMRIUc#(OO71?<;C@ni!R!5TKz00V|ywgme2$!e$*0 zB+sgjA$d4R>Cx15-S9CYgE^s!Ts-$Od;q&$xw~5JvPhB9MjQ8K>|~Bo-?K+4`#@I9(`Ip- z6*w~-f;j)fkpuPh!fgizX}boX7tt=^t?Fi3sz)zYf*-!BM$}8;wJjx-~sZKO(}f%^L~xOEQKNylUYZSbU8M$ zaKu~*HX(W4%4!a|eVQNS0D`!S7^bYny1W5CT&Y z=Z?4!1>5uuGB&jwnnbcS5NmfPau9Ht;7Zyw??B}$fy4wkC240^v(a_-+r)<=>uT@L zx=L0kSJ>{Z++vVPvBgey;U$}~Occ}vd(3@>fU5g;dzF8XV@*=XuqKx|rZ7_n7_Sxg zb5zT-F#nHk|tnA>PVYQmWIW~t(;DcRq&g=S1Gx4>BJ6<^g z)1i6Y0L(pwfTb^aMpw3mKZl+!b7zj=-+$#io<^X^jh}IvYSVX`PH?ak#p-fT)R0yz zu#q;7H-n*rLeG8u0=USY7@I?}dO8_&-BEkPrkqhkjhS?>avt#&C%0dqTIROf%KW)l zdHcgz)T1*#jEf%PI_gc)7Recjt~~x zLDjym+IrRP^eCZHDA|?e=q5DGm-<}={*Y0=D~1nizQ?gEd&8%w5}6av$7steS=GGTV8m%Kg|$7RPg4-O8u|;g=SM z_hY_DlkM9>BSCMF+AgwkY2+MLzufS53))Oup{FS@&OU^4Q8uZzz9g9w!_b9eAGA2=%?CKTzI&&2$@+5y#TH%{*>=$lPZSHnVx0Qyd@VXvjv6iA<@= zx?hHiQc>E%XJBM+qUV8`(S&*?2x?RpYozM+;oY;sar$M?GrKP;nPWuXSszS?qmF`- zyIkO@Kvq7yeGuF{@__TWKmsSJ{vnolW!BisZ%X$!3!89<4?FC~(rDx;TGb|z*U-CF zwbZuwhDmeO4YOH1bVi}21zRN@UvjlM#nWvD7p3&QC%KP_0Jb!lSZ}hJb+B|*FquZl z=0)BR3I|jMMTFJe0XmMK?3y{tTs?d&P7)JRBm$to)~!>Nmi99N1~UOx znWTvZm3o&Lr*EL9@SQBQr*cY-X`|wAeW0LYLy4Z05N;e7uXyWz3)hm3%a2e{1(kMd zpzSyZM8?khk#MG0U6nEGAfG{d_7e7?6dAm6)HJ-DNB5aXaIm1U!rw}y4AL*iZv-C@ zYs>4nlcn~Iq}R7^pCFOan~9x|CAUbFErdS#{loVT<`yOQmzZPMRt#qKiU@*EB!C-fVXSc0B0-3H&;%qNSXrblLa+ E0g(vl0RR91 literal 0 HcmV?d00001 diff --git a/docs/topic_guides/blocking_model_training.md b/docs/topic_guides/blocking_model_training.md new file mode 100644 index 0000000000..daad2a331f --- /dev/null +++ b/docs/topic_guides/blocking_model_training.md @@ -0,0 +1,111 @@ +# Blocking for Model Training + +## The purpose of the `blocking_rule` parameter on `estimate_parameters_using_expectation_maximisation` + +The purpose of this blocking rule is to reduce the number of pairwise generated to a computationally-tractable number to enable the expectation maximisation algorithm to work. 
+ +The expectation maximisation algorithm seems to work best when the pairwise record comparisons are a mix of anywhere between around 0.1% and 99.9% true matches. It works less effectively if there are very few examples of either matches or non-matches. It works less efficiently if there is a huge imbalance between the two (e.g. a billion non matches and only a hundred matches). + +It does not matter if this blocking rule excludes some true matches - it just needs to generate examples of matches and non matches. + +Since they serve different purposes, the blocking rules most appropriate to use with `blocking_rules_to_generate_predictions` will often be different to those for `estimate_parameters_using_expectation_maximisation`, but it is also common for the same rule to be used in both places. + +## Using Training Blocking Rules in Splink + + +What is the difference between the list of `blocking_rules_to_generate_predictions` specified in the Splink settings dictionary, and the blocking rule that must be provided as an argument to `estimate_parameters_using_expectation_maximisation`? 
+ +These two kinds of blocking rules can be seen in the following code snippet: + +=== ":simple-duckdb: DuckDB" + ```python + import splink.duckdb.comparison_library as cl + + settings = { + "link_type": "dedupe_only", + "blocking_rules_to_generate_predictions": [ + "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", + "l.dob = r.dob", + ], + "comparisons": [ + cl.levenshtein_at_thresholds("first_name", 2), + cl.exact_match("surname"), + cl.exact_match("dob"), + cl.exact_match("city", term_frequency_adjustments=True), + cl.exact_match("email"), + ], + } + + + linker = DuckDBLinker(df, settings) + linker.estimate_u_using_random_sampling(max_pairs=1e6) + + blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" + linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) + + blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" + linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) + + ``` +=== ":simple-apachespark: Spark" + ```python + import splink.spark.comparison_library as cl + + settings = { + "link_type": "dedupe_only", + "blocking_rules_to_generate_predictions": [ + "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", + "l.dob = r.dob", + ], + "comparisons": [ + cl.levenshtein_at_thresholds("first_name", 2), + cl.exact_match("surname"), + cl.exact_match("dob"), + cl.exact_match("city", term_frequency_adjustments=True), + cl.exact_match("email"), + ], + } + + + linker = SparkLinker(df, settings) + linker.estimate_u_using_random_sampling(max_pairs=1e6) + + blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" + linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) + + blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" + linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) + + ``` +=== 
":simple-amazonaws: Athena" + ```python + import splink.athena.comparison_library as cl + + settings = { + "link_type": "dedupe_only", + "blocking_rules_to_generate_predictions": [ + "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", + "l.dob = r.dob", + ], + "comparisons": [ + cl.levenshtein_at_thresholds("first_name", 2), + cl.exact_match("surname"), + cl.exact_match("dob"), + cl.exact_match("city", term_frequency_adjustments=True), + cl.exact_match("email"), + ], + } + + + linker = AthenaLinker(df, settings) + linker.estimate_u_using_random_sampling(max_pairs=1e6) + + blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" + linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) + + blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" + linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) + + ``` + +The answer is that they serve different purposes. \ No newline at end of file diff --git a/docs/topic_guides/blocking_predictions.md b/docs/topic_guides/blocking_predictions.md new file mode 100644 index 0000000000..3fe2c51bb6 --- /dev/null +++ b/docs/topic_guides/blocking_predictions.md @@ -0,0 +1,49 @@ +# Blocking Rules for Splink Predictions + +The purpose of these blocking rules is to try and ensure that pairwise record comparisons are generated for all true matches. + +## Using Prediction Blocking Rules in Splink + +Blocking Rules for Prediction are defined through the `blocking_rules_to_generate_predictions` parameter in the Settings dictionary of a model. 
For example: + +``` py hl_lines="3-6" +settings = { + "link_type": "dedupe_only", + "blocking_rules_to_generate_predictions": [ + "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", + "l.dob = r.dob", + ], + "comparisons": [ + cl.levenshtein_at_thresholds("first_name", 2), + cl.exact_match("surname"), + cl.exact_match("dob"), + cl.exact_match("city", term_frequency_adjustments=True), + cl.exact_match("email"), + ], + } +``` + +will generate comparisons for all true matches where names match. But it would miss a true match where there was a typo in (say) the first name. + +In general, it is usually impossible to find a single rule which both: + +- Reduces the number of comparisons generated to a computationally tractable number + +- Ensures comparisons are generated for all true matches + +This is why `blocking_rules_to_generate_predictions` is a list. Suppose we also block on `postcode`: + +```python +settings = { + "blocking_rules_to_generate_predictions": [ + "l.first_name = r.first_name and l.surname = r.surname", + "l.postcode = r.postcode" + ] +} +``` + +We will now generate a pairwise comparison for the record where there was a typo in the first name, so long as there isn't also a difference in the postcode. + +By specifying a variety of `blocking_rules_to_generate_predictions`, it becomes implausible that a truly matching record would not be captured by at least one of the rules. + +Note that Splink automatically deduplicates the record comparisons it generates. So, in the example above, the `"l.postcode = r.postcode"` blocking rule generates only record comparisons that were not already captured by the `first_name` and `surname` rule. 
\ No newline at end of file diff --git a/docs/topic_guides/blocking_rules.md b/docs/topic_guides/blocking_rules.md index eaa72463cf..ba947d0cc4 100644 --- a/docs/topic_guides/blocking_rules.md +++ b/docs/topic_guides/blocking_rules.md @@ -2,165 +2,57 @@ tags: - Blocking - Performance - - Model Training - - M Probability - - Expectation Maximisation --- -# Difference between `blocking_rules_to_generate_predictions` vs blocking rules for estimation +# The Challenges of Record Linkage -What is the difference between the list of `blocking_rules_to_generate_predictions` specifed in the Splink settings dictionary, and the blocking rule that must be provided as an argument to `estimate_parameters_using_expectation_maximisation`? +One of the main challenges to overcome in record linkage is the **scale** of the problem. -These two kinds of blocking rules can be seen in the following code snippet: +The number of pairs of records to compare grows using the formula $\frac{n\left(n-1\right)}2$, i.e. with (approximately) the square of the number of records, as shown in the following chart: -=== ":simple-duckdb: DuckDB" - ```python - import splink.duckdb.comparison_library as cl - settings = { - "link_type": "dedupe_only", - "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", - "l.dob = r.dob", - ], - "comparisons": [ - cl.levenshtein_at_thresholds("first_name", 2), - cl.exact_match("surname"), - cl.exact_match("dob"), - cl.exact_match("city", term_frequency_adjustments=True), - cl.exact_match("email"), - ], - } +![](../img/blocking/pairwise_comparisons.png) - linker = DuckDBLinker(df, settings) - linker.estimate_u_using_random_sampling(max_pairs=1e6) +For example, a dataset of 1 million input records would generate around 500 billion pairwise record comparisons. 
- blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) +So, when datasets get bigger the amount of computational resource gets extremely large (and costly). In reality, we try and reduce the amount of computation required using **blocking**. - blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) +## Blocking - ``` -=== ":simple-apachespark: Spark" - ```python - import splink.spark.comparison_library as cl +Blocking is a technique for reducing the number of record pairs that are considered by a model. - settings = { - "link_type": "dedupe_only", - "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", - "l.dob = r.dob", - ], - "comparisons": [ - cl.levenshtein_at_thresholds("first_name", 2), - cl.exact_match("surname"), - cl.exact_match("dob"), - cl.exact_match("city", term_frequency_adjustments=True), - cl.exact_match("email"), - ], - } +Considering a dataset of 1 million records, comparing each record against all of the other records in the dataset generates ~500 billion pairwise comparisons. However, we know the vast majority of these record comparisons won't be matches, so processing the full ~500 billion comparisons would be largely pointless (as well as costly and time-consuming). +Instead, we can define a subset of potential comparisons using **Blocking Rules**. These are rules that define "blocks" of comparisons that should be considered. For example, the blocking rule: - linker = SparkLinker(df, settings) - linker.estimate_u_using_random_sampling(max_pairs=1e6) + `"l.first_name = r.first_name and l.surname = r.surname"` + + will generate pairwise record comparisons amongst pairwise comparisons where first name and surname match. 
- blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) + Within a Splink model, you can specify multiple "blocks" through multiple Blocking Rules to ensure all potential matches are considered. - blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) +???+ "Further Reading" - ``` -=== ":simple-amazonaws: Athena" - ```python - import splink.athena.comparison_library as cl + For more information on blocking, please refer to XXXX - settings = { - "link_type": "dedupe_only", - "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", - "l.dob = r.dob", - ], - "comparisons": [ - cl.levenshtein_at_thresholds("first_name", 2), - cl.exact_match("surname"), - cl.exact_match("dob"), - cl.exact_match("city", term_frequency_adjustments=True), - cl.exact_match("email"), - ], - } +### Choosing Blocking Rules + The blocking process is a compromise between the amount of **computational resource** used when comparing records and **capturing all true matches**. - linker = AthenaLinker(df, settings) - linker.estimate_u_using_random_sampling(max_pairs=1e6) + Even after blocking, the number of comparisons generated is usually much higher than the number of input records - often between 10 and 1,000 times higher. As a result, the performance of Splink is heavily influenced by the number of comparisons generated by the blocking rules, rather than the number of input records. 
- blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) + Getting the balance right between computational resource and capturing matches can be tricky, and is largely dependent on the specific datasets and use case of the linkage. In general, we recommend a strategy of starting with strict blocking rules, and gradually loosening them. Sticking to less than 10 million comparisons is a good place to start, before scaling jobs up to 100s of millions (:simple-duckdb: DuckDB on a laptop), or sometimes billions (:simple-apachespark: Spark or :simple-amazonaws: Athena). + + Guidance for choosing Blocking Rules can be found in the two [Blocking in Splink](#blocking-in-splink) topic guides. - blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) - ``` +## Blocking in Splink -The answer is that they serve different purposes. +There are two areas in Splink where blocking is used: -## What is a blocking rule? +- [Training a Splink model](./blocking_model_training.md) +- [Making Predictions from a Splink model](./blocking_predictions.md) -Blocking rules are needed because it is usually computationally intractable to compare every record with every other. +each of which is described in its own dedicated topic guide. -A blocking rule specifies a constraint on how Splink generates pairwise record comparisons, dramatically reducing the total number of comparisons generated. - -For example, the blocking rule `"l.first_name = r.first_name and l.surname = r.surname"` will generate pairwise record comparisons amongst pairwise comparisons where first name and surname match. - -## The purpose of `blocking_rules_to_generate_predictions` - -`blocking_rules_to_generate_predictions` are used by Splink when the user called `linker.predict()`. 
- -The purpose of these blocking rules is to try and ensure that pairwise record comparisons are generated for all true matches. - -For example, - -```python -settings = { - "blocking_rules_to_generate_predictions" [ - "l.first_name = r.first_name and l.surname = r.surname" - ] -} -``` - -will generate comparisons for all true matches where names match. But it would miss a true match where there was a typo in (say) the first name. - -In general, it is usually impossible to find a single rule which both: - -- Reduces the number of comparisons generated to a computatally tractable number - -- Ensures comparisons are generated for all true matches - -This is why `blocking_rules_to_generate_predictions` is a list. Suppose we also block on `postcode`: - -```python -settings = { - "blocking_rules_to_generate_predictions" [ - "l.first_name = r.first_name and l.surname = r.surname", - "l.postcode = r.postcode" - ] -} -``` - -We will now generate a pairwise comparison for the record where there was a typo in the first name, so long as there isn't also a difference in the postcode. - -By specifying a variety of `blocking_rules_to_generate_predictions`, it becomes implausible that a truly matching record would not be captured by at least one of the rules. - -Note that Splink automatically deduplicates the record comparisons it generates. So, in the example above, the `"l.postcode = r.postcode"` blocking rule generates only records comparisons that were not already captured by the `first_name` and `surname` rule. - -## The purpose of the `blocking_rule` parameter on `estimate_parameters_using_expectation_maximisation` - -The purpose of this blocking rule is to reduce the number of pairwise generated to a computationally-tractable number to enable the expectation maximisation algorithm to work. - -The expectation maximisation algorithm seems to work best when the pairwise record comparisons are a mix of anywhere between around 0.1% and 99.9% true matches. 
It works less effectively if there are very few examples of either matches or non-matches. It works less efficiently if there is a huge imbalance between the two (e.g. a billion non matches and only a hundred matches). - -It does not matter if this blocking rule excludes some true matches - it just needs to generate examples of matches and non matches. - -Since they serve different purposes, the blocking rules most appropriate to use with `blocking_rules_to_generate_predictions` will often be different to those for `estimate_parameters_using_expectation_maximisation`, but it is also common for the same rule to be used in both places. diff --git a/docs/topic_guides/drivers_of_performance.md b/docs/topic_guides/drivers_of_performance.md index dbdf4d3c83..a53f3f795f 100644 --- a/docs/topic_guides/drivers_of_performance.md +++ b/docs/topic_guides/drivers_of_performance.md @@ -18,6 +18,8 @@ Additional factors which affect performance are: ### Blocking rules +Blocking rules are the primary method for managing + In most large datasets, it is computationally intractable to compare every row with every other row. The number of comparisons grows with the square of the number of input records, using the formula $\frac{n\left(n-1\right)}2$ . For instance, a million input records implies around 500bn comparisons. diff --git a/docs/topic_guides/settings.md b/docs/topic_guides/settings.md index 8f2b9531af..84176a797f 100644 --- a/docs/topic_guides/settings.md +++ b/docs/topic_guides/settings.md @@ -57,7 +57,7 @@ The `"link_type"` is defined as a deduplication for a single dataset. **2. Pairs of records to consider** -The `"blocking_rules_to_generate_predictions"` define a subset of pairs of records for the model to be trained on where there is a match on `"first_name"` or `"surname"`. +The `"blocking_rules_to_generate_predictions"` define a subset of pairs of records for the model to consider when making predictions. 
In this case, where there is a match on `"first_name"` or `"surname"`. ```py linenums="6" "blocking_rules_to_generate_predictions": [ @@ -66,6 +66,8 @@ The `"blocking_rules_to_generate_predictions"` define a subset of pairs of recor ], ``` +For more information on how blocking is used in Splink, see the [dedicated topic guide](./blocking_rules.md). + **3. Features to consider, and how they should be compared** The `"comparisons"` define the features to be compared between records: `"first_name"`, `"surname"`, `"dob"`, `"city"` and `"email"`. diff --git a/mkdocs.yml b/mkdocs.yml index 18de0df21d..bbead01669 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -121,8 +121,9 @@ nav: - Data Preparation: - Feature Engineering: "topic_guides/feature_engineering.md" - Blocking: - - Run times, performance and linking large data: "topic_guides/drivers_of_performance.md" - - Blocking rules for prediction vs estimation: "topic_guides/blocking_rules.md" + - What are Blocking Rules?: "topic_guides/blocking_rules.md" + - Model Training Blocking Rules: "topic_guides/blocking_model_training.md" + - Prediction Blocking Rules: "topic_guides/blocking_predictions.md" - Comparing Records: - Defining and customising comparisons: "topic_guides/customising_comparisons.ipynb" - Out-of-the-box comparisons: "topic_guides/comparison_templates.ipynb" @@ -132,6 +133,7 @@ nav: - Phonetic transformations: "topic_guides/phonetic.md" - Term-Frequency adjustments: "topic_guides/term-frequency.md" - Performance: + - Run times, performance and linking large data: "topic_guides/drivers_of_performance.md" - Optimising Spark performance: "topic_guides/optimising_spark.md" - Salting blocking rules: "topic_guides/salting.md" - Documentation: From 3fb67f632753df7db836e4727921565ef54219ed Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Mon, 3 Jul 2023 15:37:56 +0100 Subject: [PATCH 02/23] start feleshing out prediction blocking guide --- docs/img/blocking/cumulative_comparisons.png | Bin 0 -> 20730 bytes 
docs/topic_guides/blocking_predictions.md | 82 ++++++++++++++----- docs/topic_guides/topic_guides_index.md | 8 +- 3 files changed, 67 insertions(+), 23 deletions(-) create mode 100644 docs/img/blocking/cumulative_comparisons.png diff --git a/docs/img/blocking/cumulative_comparisons.png b/docs/img/blocking/cumulative_comparisons.png new file mode 100644 index 0000000000000000000000000000000000000000..a9438c864fd1d463756ad6d9985787657c8ae842 GIT binary patch literal 20730 zcmZUbbzD?i+xI<+64EN&9SYLY(kUs7!cfvBFmy{J-AD@(%D~VubUWk_(j_o}bc6J} z=iKLhj?eS{f@R<9#e0%8=|CF0D)?*Mn;w z5tksW)WgIe6PzH-n{3wiID)}kEte^2Ksi|UcGCOy=JI%M?^#*U)F}jK!~!zXpe%0#AOJ>pd9*-PP z)U^n_65Xlp-21BTezRqCtBjPmwDTIvSa{9Z&RN!!;3LU<@uF8dt1{0)Tb8Z5`8@^IxFjv|75*{RvC7%?9Dj-HbELnTgJ-|~=rQm$2@Fv-M$KK2 z`g@+sA6ZA6&lWN!mwk=~Qw>jwL3nl0dhUYb!+vg|?sm~_gI#mmq=GcAjbrLeE%@!t zS*xY5;L!k|x-}?q-l#|8T23@;bV{kr@hhcI9F)k2WW+=WYt9)dZoO4nr`Gzrn>+OV z^P*=u_;Trw-aC4EJ>aj{58N!U3*6SUI zjX?)L=bPk%Y>(xS&vH)M3H+WL_@0b!r0Bm_QsIK5%qDgHZ~KjIcz4P>@i#u)!&kiA zZFFc@i4h%rB{ZJ|Tn4ZHl^0oP8Y}LWut;DVN4-!zB2QTWJeiPz`@E}l_Y+X&HW!X( z-MpI$YmU$1@Lr0Z&9|sUhRCO*OWGIiZ@DYB-I5TSYZ(sN72Ni_@O3rTOh;)w(`bQ) zo%cV=TDL3vM~!X{Q>%9ywhG@`X4OE~uh%7S$!CHE{Ptt&TX$W{Y|ka!F3M94z1V?g za;ZsG-}rvYH}t~v0=OdfYN$Zzzj@2W)D*fz6RzwF~z`b zy5uKP^7cyYeHx0Qa!BHG`|XW`&kCjWtbLvR%=dT&nlv}yPjVN@k8t@0G%lOoBc7Bk zPFv<=P3YN{UKNRLeKL*d&BsxK-n489E&n;!gCyhkj3@=e{BL;Uy8ptNe6 zNwD)PuiMMT#aF3c{C2)eoL0Du$dQh}r_;k8jsPDlABNV>Y`cim`vcc(jF z3uVjF9^_|wZGOvg8$WK4)@Yyy_0+7T+gflR_@2!R$rk2#@4{6%6e7UW)urVfIOpQx zfL*gD`hTZ(SLJTnmloNrUy%o^lR`pNhXf|Uy<&8!$E7*+Yddf^dr_d87V3=5{>QCz%Q6Jn+8XHEqtv|NmrSyZ0&2(|D$swneW<@`8lTQw-y-#2)D zA- zJG<08qd)#mfHj4{A|5^uBJ7d)%NY>?t-13e|l(tNZ%C6e-grKDRP!F&!Pja0*^ zAd3CxYss_LMOlY27rm@Gs*=|m>Q$C|ZTAQUoFUgIlSKuoh9aCk+0t9fr?U>~`gRp^ zA}djRqn!kjnb9ai@15jm7Oi*B-sroX?*aRUkA57}(^I3@XIuwlZYQ}PK%g2}AmDg) z@Vy9aX2MUS9(Ojybv4v7!uhkxw@er zisWwb&@)fgIc)UlB2qwk_jZ1u_4~baKL0gw@Xbbqga761$Fng#{xCg8o^J_Ir0=e? 
zCyiygb)i_9nD^qvL|vtOa!a`cf4%sA+dWU8J&*6Ap`S|=IA!w)OF2%(geFP{r?u6^ zlf`=$?p@oK({rj5;{Fmv7VP1`MYhX355f9Dj@3Ps`UhYgp+Lc>3l1+v)t}ojfl4Pn zTkg+!w+KtOO#;rUx|by*UmoH*s>Eu3E%=?v83ljiy;I{X5{MUe^18^f)O>J`oG}WR z^wwNYCkInb)TX86<#e0^Y!hBZQhKD@(5gu~BD~=0OA+KWYO1|FZ)^TL5R4Z|()Efl z;3Ykkp)yWEb>35U8Z+yF`y_}y(~54A{s-IY{ukPUD8>7oKjq|T*qD08)G#XuA*kkx z&e7veWM~pMpQ|y}r7;ZD$B#vn2)F-bD{jNZw7Jp#qy#T}<&`puT|eZ7guUenRxWxyPR)J?Qm#%+Up zh?rk&3ElT7TJEYIw$A1YFd0J$;7%M{vgx_9F@v*GyD3~~9iRf9`|X&MeWzb&udTvM z;wIN}t3@EIXW7j^10Uj-n#g7pO*YzP@zl#%#!a_E;o>?F^H)0l-yhmOW28o8-xwRO z&or+RY#SxNx6Ve7z`emZI^d%JV6zqk+3?u*tyheAUxn^p##RI4wxz+8l3G zswcNS-y_-HF1~gdE-hO^?ZNO7%SzyBcv7vNDAuckOkLNAn{2>^^`_X;gUYMprR}D% zF-m`_Wqbo{r8rV-OfR1sA+_#h^h7#@f66!cZ7}l z{mKmsO=6KlCYsi%_Z~#G1@SCxeD)apiy+va533aMB$k2EIu%)uYuOU7DjzX-*k{Zr z+fjZ%sBzV#GLh)2gLsk_&uII>gyV>SGMM@lckcnEU0gSB8Px!=wYlaWu(I*QSr zA@r4PjqPL`kKRJ{MX^5BXx0hppBP;K>J-&kNbPPc(3KzmLuLb!eCsl9&X zdd4jEei`B%;45}mP&0J5{SrKJm1CJG6qeegiFufhj;J{URXfwJ!XQ5dNiWg@ir2l# z?yzx+e0_WTrC#Tq>lfF@W6u6qNy4S?-!(A`!u9wdI&~@bzdcwt2;VUN4k@W&07ZXd zG<9J>Itrm8!e)+s;iDE1r~Kso!%#VIOfB@;L>(%0^A9NQqw#&jG*o|ok#s4GQi^odc&W<6TtnBQ>9wv zNe$&Vj02G&{L?4AH*WsSmQarqu`nrzlS?3F38XhGb}H45D_^S_L2w=y^UQRpG>JuJ zcz*ss{v^L{MuZ+D7P(fz(;6)_qIQVq;30D=Tbu6CsEbn>XaDd*KbS15!eCRo1Z_cOD+!B>bNVc+OT@_#M zr&27H41RlbY@Kx@{koaY{ELpSA@)OEf8CuTa(JP<2AcfjvUx>jvxdDZ^)sov z*o0aJnbgT`FupS6pFGIc*pEgriTk$l<>Z zrP;AFrst=x8AP9d9zY3KU-$y16v)0x!Db-(D^K>}lAUTL5F}}^%b_&B}V1#V-Or15Hds%fS@A5{^_=anl$Zat5M ze}1efJXt?;J@jxemf^})GjdCKh1Ga9=M7fmo{p%KMv#s0VT$PvBA!SOD+{8g8|fqw z7o7Ps&U&6OGV{Z!v~fihbFJf~O=9xC_w9dF!xhJgnwU#^ zP6zoZLiy5p>^JNQL}$X54>JDASOuh&*Z&G_@8iuwA$aj6xzix>K^AtW73JP}V`&hh&ojgi}B57Ez`Fu=y^fF=uFz5x@-VwL{ zCO)@#vu7stx+YSx5%e7SuRUT8&B~WdN)Q&%^R;+#eE2)Ms97H`noM$^rB}|8zp>w@ zl(ib;E0~rrf;tqH4af*Gd+{u<6BBc7YTO8GM0icdhKn7xYG#T(y?gHIA8D0y$rBCv zJ$SgMsDxu_<>8{8D?y$eaBXq|4_f=oY;@k5lvQkhs9WwM|uyeHL$$P8lg`+_GacA>rA+J3IGwvI)0Y%}U zvgtdA#U1ir?h$gC6?v2b)@iJ99Drwa@Tnc&q$GiJt6`i%w)mI&3pjp2<;Le!k5ZMP z5HOxxGLam$;YFNrMV 
zR8qP*&_CUmO(7rFb%n7BFu^bE>NBbm`bTV9i1b4n$PT1qRNm|liS7+t^RIz*9-Xaz zPm`2a$4~Ea8$=Sh*=phwbCY&VM}eO_CCAJE(%`rA)zf~aG%I1)5FKpiw+*U82fwtc z$T*f@);~;~!L2Wh!#R53?NOVExr7Q}%b-NEWwUvzba?4NbXt7OO9b}d-YQsWMFU8F zjOYVc;w${j8jA&fq*JUeXuWSL_)DMs{f|MeJoyve=W#ua99@f3Gvn||qU9$C5@XL6 zobo?Oa2Qth@IF0GZpyLP)1%KX?FfwK9W@GPjm-wU&BChgMWoP>s6m$YUJ%}sxq)wG z=A}e?1({vTC(mfJu{KhIPaA6NOjolGxU4UN{SDwzz*wGbXrY{1GuerPft0FZDESqv zIy~X-O9y@R8=7bmlnuW|v_zx)2Z-sEhrIIF!kLe$i9&5dBl!7Fm>-{8b$%mJ6>c=r zdfxq`L~}Pg_BiR?X9I%DK|JZFhazmH7NXzWS{=F~&t$%i=U7AsE?0bj`xL!Tr+!+p zxoTJgLAtJ=VP#bb8Rz44}h4=_V&30`v=%5~%%}{v-gyXJl-Gh)Y6P25h z7d(F{1;!H46B^R+8Tu|L(|l#Op8s`|2P(nzvSd>4v!arTgR-ZM*9!s4+XIRK4mic} z-GX~~s27i59^tr96lHW3J^l|z6zL(2?YI_h!BziDY*FbB3^)Zn-IokN`y&&>2o?(N z4u0l=mv{dz;r>W4VWkhFNrlyy*blMZ!Lv(T7j3r8M)pm$bf6Vi3sdmFlHV{XpbVbe zKlQNs9|;jF3Iu!>J((8mAs4ZrAz!QtZukiJ3dm~KI3|nWzP`f9$C|5AO$S81G>|_N2W$uY|$Fx zXG5$v5ks$fQ_Wlhe^*bfUz837Zdnm_)+@HF=kTQBOk}w&MRE!_{(w>tyvi%(Se{>g zK4VI|d*#BBYJ*aKz@NhN>-a_ruBJp%Amp<;@#-k8C6@XJ3@Uj2bt6Bi_p?9&b&@E1 z&D6!SoCLfUvuBp#xcQ8wJFGfag2oRh&$hu z#YSH}V>0CHX7ZjVWXi9c9(h!1M(%*Z$rVJpTI!#A8TK88ib`hni5eoFB$V}N3nS{s z4r1jX1=%U-S_O#rMW)oHTaTr6>W>HbklPnTVqu%<(+G}g5gPqC=)nspPhkw92K@}a zPQW0<##=ZD=AzNUuPv)BCH1642Zx0BSh%k0^hddqYD~_NBYW*vBe^X$*+#w^gYEDq zgEECbusNR0KB;_NqLr&*$ZMjMk3>>#m47wKPp{!X;zoL>fFZr$CwjSB5ivs2H5cY7 zZ&_~VKq41fXAzoCTwfRxLcr`pzt&^E3_q4VR$w-lpE~*>cW53f{Q-y7t2QqQiviNn zS@b4P#5lTi`iL62{PUgBQ9Pv~2McG%uYL(R;c*+t(`e_}$9((`2H)D7tvfM<`TeM1 zD7rB&{9T+SkdTL9@X>M8bxG3HD9}H#>83DPfBsH3(!K3~r=z_Xd6|w27raGjAra0e zOTtaGYZ0o_1KDMP^%K256TNSlLB${R6>Ji)dlNc`Xcs2Z1EX5}jv5-^C+%KwQ_~kT{OFvlFv?J=ru?~0Hs5Y%wO4Gz}tvG!)C)BGB5@vli#^yMG*uY-T z)>_P&Hcu?Q+o4PiY{dUro9zkB@Pr+gH8glXCYM`yEb6*VqV7&!#=S_hcxyh+)htA;%dnxi%>$V-JodEuosxH{+s7=vjxlc6zjZAlChh_Pyf$sJIt> z=}W=)wKmUJe-nnV_lAD>cD!SFn0n7@Cqf6T;1|#Qq^h?=zxOBfCjld-b!z!%&kw={ zd)s{SPfV73UAUO<5vWWu=7%Q*MqX&m&v8+`HKij*x*cJE(S@X((-XXnqhpF!jd2P1 zCY_7h;?x0!N#3W5PL>jE?SX042!zycQtl$sapop;1^5dQ`FJ6o@?xa~EBx!__ACU; 
z24S%bm9ASwPEFZiFNAB_z0D20jC3YW!5NSDQt)J`xzz9-hZ|}fOy6b4om-&#(gU9{ zn8uWHXFQc%ycj8=XO$f$AJ?s8*@)b1keS))*=sIO5pSg8-<`gpo1QSu;;JOtZ<#8$ z67*P`yVL>}IZV_0wiv=Op$9B8)mwt_Ck*D!R%(dI@X2OOM8jG5O|j_XCjSYOz8or) zO&?DbO%L7ORkBP7gGi_KWMmw_?P{B)nN7{OZhxwVV*)RErW>m72;6UV)b*>3f+YXb zRi@^bxj5dQ61@y;r=QVz#^lPO@qxPG>Nk(`bYdV&)Xqs*_oE4)j0TB4ug20Ag%qUo zF1JlRcU=8Sj*3`D#gCB^s0rVzJRpm2SQn*`qh+4c!yi4N|Em)->>Yxtso#Uw?EXDb zbF+lWubHboA9x>xV@Y=pC*YZc+~;k((^)s*EwCPun~b?`Mf?E&y&_#gs814O998j_ zB9W;)4Vis*e`O&m7fYZnL*4dka;6TXd#aO!$Fd~u6~wA3D~iH%^FoG@N?a?(onu{H zPv&M&CgA0M%c0((m5wBMm>a42t}5kn8&HbD|GHN+FU zk}#V$zA3c@tg~ysIOG- z2_D*ptD&C3ya|3PeCMaZ6MQhJq#Yjwj!sRBa&igza5TxbChbK=kjtN)4JpjUCnpfK z$i?`$?zYc-EC=VKOHxh-CmZ+y3g)9i_|8e^SDoHuYSul%&LQVEN~v?u6C--#=zHd5 z%*(SpycSL3A;(x}JY8a(7d1!93eZKBh7pByY{)b7?2V z7m-BtOZ<2TJi6IZ^mt5ArrpCHx|k;a)$Ny)iQ0%SncsT8AgWg1JPJ-cSkH=={0@DC z&=Hs{QHCl1dh`uK;E)|x;ixL?A6pVmtBzB4w<(hTP&AZO;<=Sj4s1_$sAnHrtk{x# zvnVt9?IbNSML=&NATC?13NnYw4Ri8HdG&I|i-!b*^5aa-SWq3;`4}tt^(syQ2GU2hQ z{qNscVVNYEwr7~vaT0vTe`9LJ_BOO~6P~vX|3$9fBNP7c0bz zDxOY1mH_oWsT|+Dj>m7CwZTj2oP7uxKg`vf zd0bA2!>p6g5)y>s;E_zCC6b5@VUfHimB@(#;Tk2KA)OSyB| zmI$tsb=|z82ntl(!g93IdpgQz*l~h1+B(>ATj$7qXPal^w-Sli`WEWriIWU1Ke;jiK8vU&{v92$W7)v@0XG)eHX)UMg6s5 zCzL~+E*f7I4~|D(wc+=&gw2sK<-;V+y2a>14tr1!-3tu#|H}gCNrr&wqp%)x7ZJWT z#m6lpV(17m<-04BfsHCwquF~&Em1lVD*zN-6E6aT*gg?#$-}Zwtej1+q%B<`9z`BP zp81~1C2uLeHe_<HdRgsa@}h5K^TLhQNfJ~JAphP3UASwEIg2M#n1_?bkAJDHIl*rexqF-zlY5Bh@X z^M0v7pwz6UKVra2g61!_ZFT}zwGTLWK1GS;Y2s;mu`uY0$38reQjKF-U_6&h`c@tw zM%SneNg>RT)e9xxAfhm#ip-mA)rj|@G~}ClZuNXPn2%i_u7P@#&Tb+upGIU`LXclA zSG7Jn$h(gfdC)<+>$W|JNZx0v1f4kY(6|kljsEP|->)QR^$`dq+mMY2e$`ih4Wl5D zP3kpj3RGm>BQGN~)yxi3J>2zh<2!8oLdRM(@53ONg_wS$u6QU>mJgpmbUx~$)Sk>a zu@Xa6Jy4N_#&JF%Vt@HNq=Q2L0E=N*x^(|%_rcJQ-k?vDOJg?I{4tXkDB&L-R)e&M z0Vr=;gzc+K$9!_Gs1|`Oi9J|N1l{4Bz!A9puioJjO`72Z|DH&jqaZRNTZB%P06VU} zf%~rg$xYJvXYb&@1oq^xQzthVjy8nDt6c5Gy)EO=9+6t1{acb1&z|eG&U$ZSfY_)J z#i7mnUkMFocXI1^I2L{seu!D3eE_HEOeD=yC%V`{%jT_*^O4%LEF9u&)xC9!_sVPRmIQ4npJDlQA}FHQyv3R 
z(NZ=~@#v?s0#*f9#G$sXNK08VG5K$^eS5nmW<{a9(%;Xf8R&^|tWqU4HgJ$*xk-xZq^XtAW(_h><2E5bwB_R)OF*Nue(!WpVV*8mLyRtB_?pSH z+;{YE3}-$L2Q!*<9brcHyaKh;O_*6i1JZLW!0(h(j(4YYpOh0Hsgc^Q}YIwAkI5g%inw}6<%=l?J?_VYe3qwTaGb3bG+d4;;L$ByfhC1hXo3$9|Ta`kO&c9LsAy$+T zh6n?wg~17yR%o0XXjLo?CD#(y^uFqN%=jl}^qCK1CV?~L#}7#-!-VA!oSJW(E8E{J ziUQqFr|LQWe8hu~FZCe>ycK=DG39K@`P5QBsRS@XYoQ)u5+;$F_x~W5hN%K-QGNE~ zd~wP`L*@_+>FPfRa5vzVewYXjF-#moQ78VqqgZ4mI#9EV_2JHoKfmr{#7h;(>hK7} z#*6O*m7OR6A*sLKemD2$AXP(wYsPn9cKZW_k97~ZCGMMhfBrwljpW^Dy~)~0=IY5B zwD9e(ME2~@FM0X%mhczbdLJlYup_8yfLG-w*`G^Ary&JQw!r0fRnd1EZ#&+f3{S~y z+5fzO649@4@J`*LUoX*%nJT122AXE!M~WTg156MzNA>pzoIjYZ&-zFm#&-Z;qYSV} zrK9MhbqnAa^A7)?w*;PvaX5Z`GcS@$8BKKL}V1a(Po=M1)QP)X>)R|;kto%P`;*%nl3ku! zYJid}0uIZC+36Ypj7ok;N66}X&A(spe3!_|yY}Z_m{f76^X`jOG*=y@Epdik8wJP3 zpk|x7dynq@IUU9?u&S9sm^2>7Gv0g6WB$O6; z_=(2<>Hu6|u%Kh@KIf#sR2}=KaVQ6&G%R^zl3P|5|TfT5PK8!8K20kDl~xbFYjdMqM$ zse!`jTNsTOTNVH-!9^}(M0Y=B18fqw!Z%uALQ(SOAa)E5J$;N>4W?|BX#zkVceQ$_ z?;rl>$pc^3+*qE!)rYLsX8>>ympy>yYyhMm@(lYCjoXxo@ax}eLaoBo06Ssb|Evx- zpOx&c3bn@x2Bj0O5aBPHfbR7)_N>a)(fP%xc545Nu^blm?YZ`6zO77VyMvE0hOWEe z&)^K%@AA$Tyd7p7JMqSBjV@>YHonzVBMCHvXFc{x=oLWxSn8R@%X#r7c=WZ=0kdgb z$bdm^3@DzjM1)O_y{q2bTfbSJBZ6P-IQVB|@^}DTorU!zz>uL{uFt-2eD;VlOhjG$ z@cv$~j&Hne*+OjZM<8Fh?@1?LJGb<4E zm|hEv_HSSecr?y++ZtQo)mE+02-3?uYalY*DhZHVyizCKtLN)EIkk&^m;75LRkg{U z05YfwkT}RuG~IbRV`u~fq{;B_M?DPx40`_WL3gr%&>Ld3dQ2??y+boC$CYg5VQ8S| zIJwM+xwNv6!+Njz>~BPKR9D|VxnG3NDw3MbF>|kHZ$AzblL?$AC?a?^klYg zI_R-RLy&-D^-f1?p59lftyimPP_*c1NR+#IO`uH4GRN0#SLN2|Vl;L1`fM>r0>$gTxINJj4;Xcu}Vkngkycxq~*wqgwe$*X-PQ!UtC z8bxAc)_Nv@lx!dvV^z_sf?wqa00~ImcG7b#z5#`MH_>sW&8IVOPc3$WXs?V<2{&yw z3Q~twa{Mkf9%)5%4FW{tdH>LAiarB^GV1K>u3-w-0v!qGzfWg=L z9yC-4FoHJU4GOPkrpeBEtVE}E(QzLQ3K`7+u<01E&KWZ1#3))|^IZ*~Ll7%+4_K8A zM%r`5F-!Kpa>LmJ1Dn`wBI=ti=l6z~He_foH1s@IlgH3Z;kGZQsT72ebA{odRiWS~ zrPY~}Nh3*bg7^}^p|DJH$n=R3oNJE__yPuD&f?aieO4h2%uQ3%K~hCDFiUix>o@IE zFt4DC$MMhxoSmqHD88X7Ag_qGB`WIYYP1UnAMlC;yvX#2BiI1XcPe4rdNdA@R}Em9 
zh1`Azk=6ytx;7DyA**^cT-Vu-9oW;RjAa|?wy<6^{E-*H1{g)d!(2`K9arA|Y7*xt zRS`le*+5b|V7ih1wqinvuoji0DtSoAkED0$2t{&jfuu9Q(};>6A{ecKNUh#c0#u87 zIp+hGya&?DQDH?m^smdGc&^SL?pX}Z%>fkj^jP(zq4#Qo>BSLW7vTfQ%GF}cbK-Y$ z4G&IV%K_V`k+ z4KQqqCEom)N$d@xo=1Y5cpqiTi=%~9errNw=8u?n`&WTWJSqvvayvwK>lsS{_c*wjJi%X+nw}Gg87o?vI=?ZSpY+*{Pl}DZsLL$1A&=Hw zkfcwFBa*qteY$nin^^60LrD6JYOW7cTL%Ket@DC0c3nWf;FMOzC7;UHsOO(B;KYj#OWMRNN4q6fIjDBY%#!UE&B%mb` zdb!ssIax0PWK5$#zekG1#YpU00|;o9fz+99{PM!&LIQL)yim8gT@LU84r?rk@u?nR z^gHfsVl0=)my5kRM85(2()-Y|<$P_+4cEY+b*FZAP3Fs_k^ePpH!{*Y68-HzsZlm| zD^!5V9g(VIV>HNp-a|z|7SVCub3H57+{krYXKcZHFRL@;%Sd`W6v(#3z5B0r+J4F4 z3p4ds_fXiH5u|q(=)^2qWY~|QGkwFehRFi7_y-ij=EB_DU60gNRk)j7M1T@^42Ykf zZjqE{KuyJwlU5HTGKJj!=lvUiUr_>qo#Z44)53>)E2z z-rE({-wIMTaRPyc874H+`*-&YP-$3s?RhU|?6Ib{}e4!bo!z%+n&qy07&c zXho776y0k&&+PhElr547`%ddGW)HOUz_TaWi!u`=fL?+kG{AbUDtGoCz|Yg>JZK=0 z)nrg1P(6#G`Fc3P4lq+HeY*UDXQY{Z-|rMOrAbSDjEv-q-6=OzW4 z`DAnDFNZV6ASMB_e8@V@JVi-#yIh8a4~vXo7~R;Qv$bXZULUhx~ZO){U zQ8aq$C`DMX;0^m751nuHfOKikzsk}&MtDsz!@l;d9?*Vu6m}0_6X$>XOj{4Etpj

-h;g8e;?tHLJ&PDGf%vfo4Brqj zuEd&^q$0Bq23k{WHh}uL7efEoebLwb^ky5VNvwgAxBu+x?5dxcG8Qb3bKzO`^L!h0 z)4@`QG>mOAjMmR1LDsKzvhD#V(#EcM`k|x^0_YVwfw~-zD5yiy2<{Vd(MT_FDqV9d zlOms7t>zwpYbbLx=$6BtGFYKI#0cRWZ`d%lpz8EUX_&{@W{*42p<=l_K9whZ8P0v% zD?;_`i)4C2*ubrf5Mta4R{cwqkW2zS&Mu3uLh%CK&NBZICH~}WCc;Xb) zw`J=Df^ls4ax->5g2TVlb3Memw$XTXIz*7!0BTzazvad1VXu z9ookCyZ`Y!d=H)M>4rtoe#cAb$Ar4qVJ~aqzVQhnE7izPIFa+?(tI_#x>Z6Wl5t0%35Nn$ki?|`%%2uHO4H!wM%lm(8} zeU=}cbH3>Pkx+gM

    FrA7bLDa8(aW!wZuOJL8boYu_8u)=6r z(SsS3eE`SF+#(FGA+lwhkaw7PS@%Ny#K^Mj2S2Kq%KJPSOUC&* z!oqSF=uA>&nHu>YCHrIehcQFI0YH007gE5z@Wbzed!9(|LzC8_-=aVj9|V5YmHe9g z=*=-WWT5$Nb%66vaM;U|=r@#)OzZ&(vNT1?1Lk#$f3dHqv+Omj5v-7IPxGuS~@xCaG7L3s|7WurMt%xdu))vz&47k50 zaDS?54OpP~gsR{q6+gvu7fxJX zV8SVAvYkN68y8z;>YT9px5R+_gZY{>77;VUU2;~pyet7 z4W2&ffHZ>X*K`ZC9kPm&9(9u%soFH4yHmxJ7ju6NZIy^#HL1bYv%R@U_t1J|7;0yn zsyY#l-{#kM1xSkPUOr>0e6O?*9uTd&@e*(h0V$sIjwt`v7j+4qgib7DXa)Vmo^QQ} zZh1~^O{xu^X8&~zI(Yncg$zhQe`LnAhGjwe*x`Y<=x9c^fKCzFizkq4Q-y#;Vf{;G z0^P}X-sV&@vj+4Z9s|9A`H5;G?9VE?h^0BcEXo7?ev{`EYo=?t+V^uF6j#0sR)+O_uvFXxteYJle#LN zEZ%ao9*=IvoOc;}^%h)2nCWxaYaQ}LmrUmXox@e{wOkgWdqS-N0W3QTjIK`XI4032 z4{^ys_#$j*0fgMPX3bb~#(7xH;)6ivyyn~MgfuN*;15R8I=-RDQpolWx+6{`-wgBb z)nyn`pmLHmj6|nm=#z?3;K(86ee3mUuDcZ>F!Ce1WmT5!T|A+==kxxz*YG&^WH27X z(2m*e{h{z%AV#Mx9K^WuKcgNW5Cj8f1F!5UA>&_>9u?_ww7|JT&uC0d#otY?H?l+{ znl_CmD$vy0mNg6RHFybNy7Epg-E64)H#qP<)X@$_chireGvn$lSffSLwJPO5Mv|Jk zTE7z0pFy%08?9;`6z(jVhWW(O_EXp=SdA3WUM1BHPabG)^8{Ldkr|qMB)-QrWxmh0 zf=AP*Ln$4aD!$o&5v6c_KHu1O_o1oJ_u^}x%h{@3qWq|`JuxBtL-Ce z_Ega|3WX42v+YQKKhiGbnUnn?xt7#3icDr7*VM@exQZ!q+ z%7(<-uuKBl_9ypJwAxAH0Jt)BbY=^fT$Y#M-{a(JnpRKSIR>`E&n6Z$&sg2Z0mXf_ z^Lcqrc}=RD{A9p7>5F=`&f@Ti8H@WDOqx1N+bn3uJoE6(pOop?g|hC@xL zSVqA2ZLpy9SIlp>EhRIAVvi>QBT!o#>3KUfqM*9n4jfrzHQzwW(N(pDS`~0YvPRV| zKkN8208?-ry6;ekOrMTSLE$`i^IMJ`59RT8Y5|VpEZy~itl>_XI`)^h&s3?PgazikupOwq-@X^A4sF0Z!gHU9G96k{b_u#t389e)A^w+qDN*vi$ zH4}QA_CP=TtJP)pD};hX0^TQor~ulnn;#BCg})}g<&xo>yJSBqk~sP1viYq@jW_3* z#9Rc3NQpp1s-n#R&O;#=4wW(wkJy1M!bo^n1<6RvL&e zTH5_?(4gfH9!N0W>my-*hp%{m@-$n+IQ&_7SVDo6o4WwdM=?4>__9k|ioPRxaB zFZRQk!T##>>VF;3k_YHPx~U7BZvOsB0oUZaKI!>xN+zTRE}Agvc08*8?>OHoZFsl8 zxD#LWsS(@Tx7pu;A5%!&!bmn1x{LVtoEDwt02D$3Sm{t2=98>pAHiLW`MMNB@%h33 zWPj+Zs}I0WKKLxa5{m7-7oc=H6)wPkf2O{R^IM791Ne;;en+DxW9I0i!5ii3MI*7x z!12_aYl;1)d7qfc#mg7{A{u!G6;96)e5cGVt&_bT-TqB6(1La4RZTg1otl)kA=3gD z0`qSr*Abks7luwBodAxa?J?bF;!2#~1xkz>jJ6sP_?%puDpp>0%_kEXf${jax|RmT zn9PIH-atUlc{nSQ_r+)j-L7O5<^W^?a5E*=(%0s!-ogd#zCEkY9ag=mgn2Uy#UdzN 
z#%sGt^ieY|r4ys*qZn{3!ii_|<#>$(tuMbQE?4W-VPSXdDj8*vmq&9+fDl#5a(27) zXJ+_epfL9?5UaglWYNaLhdnA)qsp8GXn!0ekRHbZoL^tWw4u)uRNuUol`kFwPQt*y zU+Zl6WsAPO|Np8u_kX7QK8_=Ia-Kt2Y-|WEnX_p*Y^`BhMJU%oh)QkDX}eNv8Df}Y zg~;h9F_mf--9-!~q(aIen^Q?Pb#=Jpy1sM$aNqYo@crTY_`E-#_xtsFzF+(Djz-=^ zZR=ZiX<#&%W-QcM|?(U%zJ&5_W|4)ozMvV84!FvemuNj|Aw*CeCZ$m-2E{scEDw|$hw&a zDr0uBZ()fnCCJ44opx!lH=FK1jojnhfSEnzH%|e2k;W+CeXCgVix#OHd+J)8S*qXK zV=2C~6L{2qcvMU6X??eDS-}qL%QwCf1>A}heZ3G^1tzmgH`K;R!dcYA7u{5 zjcisP*E^<#BWGrJI!fx(C}E3Nb7W$*uLy(Java(31glb)Q4`#Ns7cy+*i=!FktB&& z?0Lf#5O#f~OHzfoICtxy+6{WK%8JWb7Ifc;GSY9RF^>^-=V^zx?=YZ!5W`R>2(V4Gsw#_4j<0kcj zP`+F8K>72F7e@zBoNrUf$t6J9r0FO!Q_Qs?4O5!yK9gxZ3NhHA)PS$K*Y&E$nOuCJ zT}@{Tn}Jq(0rv!Y_{8mprA{btsq-cG+ApCKR>)nR9w5f<#hL<}_efW1Du6Gzzmkr9 zH6~9T$wLgjR+pS)hbLN5dD;))Sh{*FLS|*V4UWb{?*0w>XJX1Ub<q`L>Su!T{4+`Y}T&6V$$ zkf}2%(5IB?-k8X~hW`iqP?AGW_GW&OJzgo8i&Kzy%ak3=Xi=b(KwCxW8U+>DQ6uHv zyl4<*>jC-D=>@nblX2oMy^@xvM=y`ttS7sA&O*hb_+afip_TB6McOLNy$&-C7w<+L zVpCDr*AL?=hjbqSPW4os~}U_m0RflcRuwg+cf;Kd=dTVM3kFI1=4J1Fs(^U&NyVl3OX8WIsnjy)8D&s%rr<99&}6 z9lNGhA!BWT#x)xsI|^XyeCi5qqW!92*ZnsoIXqX;NhvqRdLPR>tzD(n_{oVM|H*mj z3$SkojJ5}HZgNN;p}Me69F5FpU$$!YS}#HxG&@OPkF@wZ%C~f9(zBA9`agCk3%98r zxAF%*MV9UxU12?$AuIg_( z0O8@yr455@x|p(fu3q#5-{B;fx8o!&Y`beM*flpF3H|K(Riq3omFUGyZiO}ugXF^1 z7&zp+p0+en-%B3{1r&x%C>UmNhTdkdwo8Uo>UQv%=WM~L+AKJ0>!TmfDG{9=Fa7ac zud(b%6*|-K&ko;9YnA~2z*je^yY2UcnPL99m}5D^Hi8w`Cc}mx0J&o`2rT=~a}mG} z*!vME)z9Oa@Zg6zRSP?wI~Ow=F9q^ywXF1Oan{imZ=YYf=DYLt6Uy%MAtYVkJr(LR zLPdIU0%DLVMWK2j%LZ!h%z}j-WHHMEKhIBa5M38}vS6$YEf!>-s%9bFubD8zckAL9 zmfM;g0CFCN%3OMT&(#LYqAM_1_77*3pVEYi zOHjB6xl0R%&pY_}^4Nbt+A3 zV)??i@8+^GQQI%x);Dghyj6n)juzA6==4W*^$}p|E3k$|!dfzj2&8Y2x}14T@u*h~9+|qxfyh6Nr8!EeyfS40 zq8f~lyg1_QW{5s)x`f+2CKec#dt{NS;ks~RwjjzM9pIuittjGSLQIyGJ34e?usc{2 z+xq+<#5pED!&?Ab8uLz**q4i?q|VhYodfP;AWXa=^mgk=08ZfqnZd(_Q zrOw6y>2}WUfFovR-fa)m*^s@Qlx`%7beN}hax73+-Dhzh{z&qn6FZsIMw%r^^;h9` zgI)k%k9(Pk&JK3dPoEOc#EsX9#bj~Uy@nnW#$*q+{jesl)LuT_``XBs)id8{2{$`0 
z$2;}KV2L!I7o8blsS}_imN>;ij8)|Y|B?fCq>y<1b6W~Y*7?u&3nxC} zg)c;J5d7F|8>14*XrZ#W5&&JV`vTTyeCFYen%KlhxufF3%?mfcK@>v09_VxiRAI8H z%_Xc&ZOhS03`%ohFhIf n=hVK@y6eCApd3RD^gTH|7izPeDRTn|GnDbf5%+R2A Date: Tue, 4 Jul 2023 14:14:10 +0100 Subject: [PATCH 03/23] Flesh out BR performance guide --- docs/topic_guides/blocking_model_training.md | 130 +++++-------------- docs/topic_guides/blocking_performance.md | 85 ++++++++++++ docs/topic_guides/blocking_predictions.ipynb | 97 ++++++++++++++ mkdocs.yml | 1 + 4 files changed, 212 insertions(+), 101 deletions(-) create mode 100644 docs/topic_guides/blocking_performance.md create mode 100644 docs/topic_guides/blocking_predictions.ipynb diff --git a/docs/topic_guides/blocking_model_training.md b/docs/topic_guides/blocking_model_training.md index daad2a331f..6dd016e2e2 100644 --- a/docs/topic_guides/blocking_model_training.md +++ b/docs/topic_guides/blocking_model_training.md @@ -1,111 +1,39 @@ # Blocking for Model Training -## The purpose of the `blocking_rule` parameter on `estimate_parameters_using_expectation_maximisation` +Model Training Blocking Rules choose which record pairs from a dataset get considered when training a Splink model. These are used during Expectation Maximisation (EM), where we estimate the [m probability](./fellegi_sunter.md#m-probability) (in most cases). -The purpose of this blocking rule is to reduce the number of pairwise generated to a computationally-tractable number to enable the expectation maximisation algorithm to work. +The aim of Model Training Blocking Rules is to reduce the number of record pairs considered when training a Splink model in order to reduce the computational resource required. Each Training Blocking Rule define a training "block" of records which have a combination of matches and non-matches that are considered by Splink's Expectation Maximisation algorithm. 
-The expectation maximisation algorithm seems to work best when the pairwise record comparisons are a mix of anywhere between around 0.1% and 99.9% true matches. It works less effectively if there are very few examples of either matches or non-matches. It works less efficiently if there is a huge imbalance between the two (e.g. a billion non matches and only a hundred matches). +The Expectation Maximisation algorithm seems to work best when the pairwise record comparisons are a mix of anywhere between around 0.1% and 99.9% true matches. It works less efficiently if there is a huge imbalance between the two (e.g. a billion non matches and only a hundred matches). -It does not matter if this blocking rule excludes some true matches - it just needs to generate examples of matches and non matches. +Note: Unlike [Prediction Blocking Rules](./blocking_predictions.md), it does not matter if Training Blocking Rules excludes some true matches - it just needs to generate examples of matches and non-matches. -Since they serve different purposes, the blocking rules most appropriate to use with `blocking_rules_to_generate_predictions` will often be different to those for `estimate_parameters_using_expectation_maximisation`, but it is also common for the same rule to be used in both places. ## Using Training Blocking Rules in Splink -What is the difference between the list of `blocking_rules_to_generate_predictions` specifed in the Splink settings dictionary, and the blocking rule that must be provided as an argument to `estimate_parameters_using_expectation_maximisation`? 
- -These two kinds of blocking rules can be seen in the following code snippet: - -=== ":simple-duckdb: DuckDB" - ```python - import splink.duckdb.comparison_library as cl - - settings = { - "link_type": "dedupe_only", - "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", - "l.dob = r.dob", - ], - "comparisons": [ - cl.levenshtein_at_thresholds("first_name", 2), - cl.exact_match("surname"), - cl.exact_match("dob"), - cl.exact_match("city", term_frequency_adjustments=True), - cl.exact_match("email"), - ], - } - - - linker = DuckDBLinker(df, settings) - linker.estimate_u_using_random_sampling(max_pairs=1e6) - - blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) - - blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) - - ``` -=== ":simple-apachespark: Spark" - ```python - import splink.spark.comparison_library as cl - - settings = { - "link_type": "dedupe_only", - "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", - "l.dob = r.dob", - ], - "comparisons": [ - cl.levenshtein_at_thresholds("first_name", 2), - cl.exact_match("surname"), - cl.exact_match("dob"), - cl.exact_match("city", term_frequency_adjustments=True), - cl.exact_match("email"), - ], - } - - - linker = SparkLinker(df, settings) - linker.estimate_u_using_random_sampling(max_pairs=1e6) - - blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) - - blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) - - ``` -=== 
":simple-amazonaws: Athena" - ```python - import splink.athena.comparison_library as cl - - settings = { - "link_type": "dedupe_only", - "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)", - "l.dob = r.dob", - ], - "comparisons": [ - cl.levenshtein_at_thresholds("first_name", 2), - cl.exact_match("surname"), - cl.exact_match("dob"), - cl.exact_match("city", term_frequency_adjustments=True), - cl.exact_match("email"), - ], - } - - - linker = AthenaLinker(df, settings) - linker.estimate_u_using_random_sampling(max_pairs=1e6) - - blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) - - blocking_rule_for_training = "l.dob = r.dob and l.city = r.city" - linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training) - - ``` - -The answer is that they serve different purposes. \ No newline at end of file +Blocking Rules for Model Training are used as a parameter in the `estimate_parameters_using_expectation_maximisation` function. After a `linker` object has been instantiated, you can estimate `m probability` with training sessions such as: + +```python + +blocking_rule_for_training = "l.first_name = r.first_name" +linker.estimate_parameters_using_expectation_maximisation( + blocking_rule_for_training + ) + +``` + +Here, we have defined a "block" of records where `first_name` are the same. As names are not unique, we can be pretty sure that there will be a combination of matches and non-matches in this "block" which is what is required for the EM algorithm. + +Matching only on `first_name` will likely generate a large "block" of pairwise comparisons which will take longer to run. In this case it may be worthwhile applying a tighter blocking rule to reduce runtime. 
For example, a match on `first_name` and `surname`: + +```python + +blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" +linker.estimate_parameters_using_expectation_maximisation( + blocking_rule_for_training + ) + +``` + +which will still have a combination of matches and non-matches, but fewer record pairs to consider. \ No newline at end of file diff --git a/docs/topic_guides/blocking_performance.md b/docs/topic_guides/blocking_performance.md new file mode 100644 index 0000000000..ea206ee9cd --- /dev/null +++ b/docs/topic_guides/blocking_performance.md @@ -0,0 +1,85 @@ +# Blocking Rule Performance + +When considering computational performance of blocking rules, there are two main drivers to address: + +- How many pairwise comparisons are generated +- How long each pairwise comparison takes to run + +Below we run through an example of how to address each of these drivers. + +## Tight vs loose Blocking Rules + +One way to reduce the number of comparisons being considered within a model is to apply tight (or strict) blocking rules. However, this can have a significant impact on how well the Splink model works. + +In reality, we recommend getting a model up and running with strict Blocking Rules and incrementally loosening them to see the impact on the runtime and quality of the results. By starting with tight blocking rules, the linking process will run faster which means you can iterate through model versions more quickly. + +??? example "Example - Incrementally loosening Prediction Blocking Rules" + + When choosing Prediction Blocking Rules, consider how `blocking_rules_to_generate_predictions` may be incrementally loosened. We may start with the following rule: + + `l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob`. + + This is a very strict rule, and will only create comparisons where full name and date of birth match. 
This has the advantage of creating few record comparisons, but the disadvantage that the rule will miss true matches where there are typos or nulls in any of these three fields. + + This blocking rule could be loosened to: + + `substr(l.first_name,1,1) = substr(r.first_name,1,1) and l.surname = r.surname and l.year_of_birth = r.year_of_birth` + + Now it allows for typos or aliases in the first name, so long as the first letter is the same, and errors in month or day of birth. + + Depending on the size of your input data, the rule could be further loosened to + + `substr(l.first_name,1,1) = substr(r.first_name,1,1) and l.surname = r.surname` + + or even + + `l.surname = r.surname` + + The user could use the `linker.count_num_comparisons_from_blocking_rule()` function to select which rule is appropriate for their data. + +## Efficient Blocking Rules + +While the number of pairwise comparisons is important for reducing the computation, it is also helpful to consider the efficiency of the Blocking Rules. There are a number of ways to define subsets of records (i.e. "blocks"), but they are not all computationally efficient. + +From a performance perspective, here we consider two classes of blocking rule: + +- Equi-join conditions +- Filter conditions + +### Equi-join Blocking Rules + +Equi-joins are simply equality conditions between records, e.g. + +`l.first_name = r.first_name` + +These equality-based blocking rules are extremely efficient and can be executed quickly, even on very large datasets. + +Equality-based blocking rules should be considered the default method for defining blocking rules and form the basis of the upcoming [Blocking Rules Library](https://github.com/moj-analytical-services/splink/pull/1370). + + +### Filter Blocking Rules + +Filter conditions refer to any Blocking Rule that isn't a simple equality between columns. E.g. 
+ +`levenshtein(l.surname, r.surname) < 3` + +Similarity based blocking rules, such as the example above, are inefficient as the `levenshtein` function needs to be evaluated for all possible record comparisons before filtering out the pairs that do not satisfy the filter condition. + + +### Combining Blocking Rules Efficiently + +Just as the choice of Blocking Rules can impact performance, so can the way they are combined. The most efficient Blocking Rules combinations are "AND" statements. E.g. + +`l.first_name = r.first_name AND l.surname = r.surname` + +"OR" statements are not as efficient and should be used sparingly. E.g. + +`l.first_name = r.first_name OR l.surname = r.surname` + + + +??? note "Spark-specific Further Reading" + + Given the ability to parallelise operations in Spark, there are some additional configuration options which can improve performance of blocking. Please refer to the Spark Performance Topic Guides for more information. + + Note: In Spark, Equi-joins can also be referred to as **hashed** rules, and facilitate splitting the workload across multiple machines. \ No newline at end of file diff --git a/docs/topic_guides/blocking_predictions.ipynb b/docs/topic_guides/blocking_predictions.ipynb new file mode 100644 index 0000000000..48d4dcd580 --- /dev/null +++ b/docs/topic_guides/blocking_predictions.ipynb @@ -0,0 +1,97 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Blocking Rules for Splink Predictions\n", + "\n", + "The purpose of these blocking rules is to try and ensure that pairwise record comparisons are generated for all true matches." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Prediction Blocking Rules in Splink\n", + "\n", + "Blocking Rules for Prediction are defined through the `blocking_rules_to_generate_predictions` parameter in the Settings dictionary of a model. 
For example:\n", + "\n", + "``` py hl_lines=\"3-6\"\n", + "settings = {\n", + " \"link_type\": \"dedupe_only\",\n", + " \"blocking_rules_to_generate_predictions\": [\n", + " \"l.first_name = r.first_name\",\n", + " \"l.surname = r.surname\",\n", + " ],\n", + " \"comparisons\": [\n", + " ctl.name_comparison(\"first_name\"),\n", + " ctl.name_comparison(\"surname\"),\n", + " ctl.date_comparison(\"dob\", cast_strings_to_date=True),\n", + " cl.exact_match(\"city\", term_frequency_adjustments=True),\n", + " ctl.email_comparison(\"email\"),\n", + " ],\n", + "}\n", + "```\n", + "\n", + "will generate comparisons for all true matches where names match. But it would miss a true match where there was a typo in (say) the first name.\n", + "\n", + "In general, it is usually impossible to find a single rule which both:\n", + "\n", + "- Reduces the number of comparisons generated to a computationally tractable number\n", + "\n", + "- Ensures comparisons are generated for all true matches\n", + "\n", + "This is why `blocking_rules_to_generate_predictions` is a list. Suppose we also block on `postcode`:\n", + "\n", + "```python\n", + "settings_example = {\n", + " \"blocking_rules_to_generate_predictions\": [\n", + " \"l.first_name = r.first_name and l.surname = r.surname\",\n", + " \"l.postcode = r.postcode\"\n", + " ]\n", + "}\n", + "```\n", + "\n", + "We will now generate a pairwise comparison for the record where there was a typo in the first name, so long as there isn't also a difference in the postcode.\n", + "\n", + "By specifying a variety of `blocking_rules_to_generate_predictions`, it becomes unlikely that a truly matching record would not be captured by at least one of the rules.\n", + "\n", + "Note that Splink automatically deduplicates the record comparisons it generates. So, in the example above, the `\"l.postcode = r.postcode\"` blocking rule generates only record comparisons that were not already captured by the `first_name` and `surname` rule." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Choosing Prediction Blocking Rules\n", + "\n", + "When defining blocking rules it is important to " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mkdocs.yml b/mkdocs.yml index bbead01669..3742efa606 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -124,6 +124,7 @@ nav: - What are Blocking Rules?: "topic_guides/blocking_rules.md" - Model Training Blocking Rules: "topic_guides/blocking_model_training.md" - Prediction Blocking Rules: "topic_guides/blocking_predictions.md" + - Computational Performance: "topic_guides/blocking_performance.md" - Comparing Records: - Defining and customising comparisons: "topic_guides/customising_comparisons.ipynb" - Out-of-the-box comparisons: "topic_guides/comparison_templates.ipynb" From 1c06ddb5feec1424884fef779148ec83b0184d70 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Tue, 4 Jul 2023 16:07:43 +0100 Subject: [PATCH 04/23] add choosing BRs guidance --- docs/topic_guides/blocking_model_training.md | 25 +++++++-- docs/topic_guides/blocking_predictions.md | 38 ++++++-------- docs/topic_guides/blocking_rules.md | 2 +- docs/topic_guides/drivers_of_performance.md | 54 ++------------------ mkdocs.yml | 5 +- splink/linker.py | 14 +++-- 6 files changed, 57 insertions(+), 81 deletions(-) diff --git a/docs/topic_guides/blocking_model_training.md b/docs/topic_guides/blocking_model_training.md index 6dd016e2e2..6bce368d88 100644 --- a/docs/topic_guides/blocking_model_training.md +++ b/docs/topic_guides/blocking_model_training.md @@ -6,10 +6,11 @@ The aim of Model Training Blocking Rules is to reduce the number of record 
pairs The Expectation Maximisation algorithm seems to work best when the pairwise record comparisons are a mix of anywhere between around 0.1% and 99.9% true matches. It works less efficiently if there is a huge imbalance between the two (e.g. a billion non matches and only a hundred matches). -Note: Unlike [Prediction Blocking Rules](./blocking_predictions.md), it does not matter if Training Blocking Rules excludes some true matches - it just needs to generate examples of matches and non-matches. +!!! note + Unlike [Prediction Rules](./blocking_predictions.md), it does not matter if Training Rules exclude some true matches - they just need to generate examples of matches and non-matches. -## Using Training Blocking Rules in Splink +## Using Training Rules in Splink Blocking Rules for Model Training are used as a parameter in the `estimate_parameters_using_expectation_maximisation` function. After a `linker` object has been instantiated, you can estimate `m probability` with training sessions such as: @@ -36,4 +37,22 @@ linker.estimate_parameters_using_expectation_maximisation( ``` -which will still have a combination of matches and non-matches, but fewer record pairs to consider. \ No newline at end of file +which will still have a combination of matches and non-matches, but fewer record pairs to consider. + + +## Choosing Training Rules + +The idea behind Training Rules is to consider "blocks" of record pairs with a mixture of matches and non-matches. In practice, most blocking rules have a mixture of matches and non-matches so the primary consideration should be to reduce the runtime of model training by choosing Training Rules that reduce the number of record pairs in the training set. + +There are some tools within Splink to help choose these rules. 
For example, the `count_num_comparisons_from_blocking_rule` function gives the number of record pairs generated by a blocking rule: + +```py + +linker.count_num_comparisons_from_blocking_rule("l.first_name = r.first_name AND l.surname = r.surname") + +``` + +It is recommended that you run this function to check how many comparisons are generated before training a model so that you do not needlessly run a training session on billions of comparisons. + +!!! note + Unlike [Prediction Rules](./blocking_predictions.md), Training Rules are treated separately for each EM training session; therefore the total number of comparisons for Model Training is simply the sum of `count_num_comparisons_from_blocking_rule` across all Blocking Rules (as opposed to the result of `cumulative_comparisons_from_blocking_rules_records`). \ No newline at end of file diff --git a/docs/topic_guides/blocking_predictions.md b/docs/topic_guides/blocking_predictions.md index 5878d3a22a..df0dbf14a4 100644 --- a/docs/topic_guides/blocking_predictions.md +++ b/docs/topic_guides/blocking_predictions.md @@ -8,7 +8,7 @@ The aim of Prediction Blocking Rules are to: - Reduce the total number of comparisons being generated -## Using Prediction Blocking Rules in Splink +## Using Prediction Rules in Splink Blocking Rules for Prediction are defined through `blocking_rules_to_generate_predictions` in the Settings dictionary of a model. For example: @@ -51,43 +51,39 @@ We will now generate a pairwise comparison for the record where there was a typo By specifying a variety of `blocking_rules_to_generate_predictions`, it becomes unlikely that a truly matching record would not be captured by at least one of the rules. -Note that Splink automatically deduplicates the record comparisons it generates. So, in the example above, the `"l.postcode = r.postcode"` blocking rule generates only records comparisons that were not already captured by the `first_name` and `surname` rule. +!!! 
note + Unlike [Training Rules](./blocking_model_training.md), Prediction Rules are considered collectively, and are order-dependent. So, in the example above, the `l.postcode = r.postcode` blocking rule only generates record comparisons that are a match on `postcode` and were not already captured by the `first_name` and `surname` rule. -## Choosing Prediction Blocking Rules +## Choosing Prediction Rules When defining blocking rules it is important to consider the number of pairwise comparisons being generated your the blocking rules. There are a number of useful functions in Splink which can help with this. -Once a linker has been instatiated, we can use the `cumulative_num_comparisons_from_blocking_rules_chart` function to look at the cumulative number of comparisons generated by `blocking_rules_to_generate_predictions`: +Once a linker has been instantiated, we can use the `cumulative_num_comparisons_from_blocking_rules_chart` function to look at the cumulative number of comparisons generated by `blocking_rules_to_generate_predictions`. 
For example, a settings dictionary like this: ```py -from splink.duckdb.linker import DuckDBLinker -import splink.duckdb.comparison_template_library as ctl -import splink.duckdb.comparison_library as cl - -import pandas as pd - -df = pd.read_csv("splink/tests/datasets/fake_1000_from_splink_demos.csv") - settings = { - "link_type": "dedupe_only", "blocking_rules_to_generate_predictions": [ "l.first_name = r.first_name", "l.surname = r.surname", ], - "comparisons": [ - ctl.name_comparison("first_name"), - ctl.name_comparison("surname"), - ctl.date_comparison("dob", cast_strings_to_date=True), - cl.exact_match("city", term_frequency_adjustments=True), - ctl.email_comparison("email"), - ], } +``` +will generate something like: +``` linker = DuckDBLinker(df, settings) linker.cumulative_num_comparisons_from_blocking_rules_chart() ``` ![](../img/blocking/cumulative_comparisons.png) -Where XXXXXXX something on number of records being order dependent. \ No newline at end of file +Where, similar to the note above, the `l.surname = r.surname` bar in light blue is a count of all record comparisons that match on `surname` and have not already been captured by the `first_name` rule. 
+ +You can also return the underlying data for this chart using the `cumulative_comparisons_from_blocking_rules_records` function: + +```py +linker.cumulative_comparisons_from_blocking_rules_records() +``` +> [{'row_count': 2253, 'rule': 'l.first_name = r.first_name'}, +> {'row_count': 2568, 'rule': 'l.surname = r.surname'}] \ No newline at end of file diff --git a/docs/topic_guides/blocking_rules.md b/docs/topic_guides/blocking_rules.md index ba947d0cc4..678f9e8114 100644 --- a/docs/topic_guides/blocking_rules.md +++ b/docs/topic_guides/blocking_rules.md @@ -34,7 +34,7 @@ Instead, we can define a subset of potential comparisons using **Blocking Rules* ???+ "Further Reading" - For more information on blocking, please refer to XXXX + For more information on blocking, please refer to [this article](https://toolkit.data.gov.au/data-integration/data-integration-projects/probabilistic-linking.html#key-steps-in-probabilistic-linking) ### Choosing Blocking Rules diff --git a/docs/topic_guides/drivers_of_performance.md b/docs/topic_guides/drivers_of_performance.md index a53f3f795f..af0258d1d9 100644 --- a/docs/topic_guides/drivers_of_performance.md +++ b/docs/topic_guides/drivers_of_performance.md @@ -1,63 +1,19 @@ --- tags: - Performance - - Blocking --- -## Run times, performance, and linking large data -This topic guide covers the fundamental drivers of the run time of Splink jobs. It also describes the tools that are built into Splink that help you to understand how long a job is likely to take. +This topic guide covers the fundamental drivers of the run time of Splink jobs. -In summary, **your choice of blocking rules is by far the most important driver of performance.** +The primary driver of run time is **the number of record pairs that the Splink model has to process**. In Splink, the number of pairs to consider is reduced using **Blocking Rules** which are covered in depth in their own set of [topic guides](./blocking_rules.md). 
Additional factors which affect performance are: -- the complexity of your comparisons, whether you apply term frequency adjustments, +- the complexity of your comparisons e.g. whether you apply term frequency adjustments - whether you choose to set `retain_matching_columns` and `retain_intermediate_calculation_columns` to `True` in your settings, - whether you filter out comparisons with a match score below a given threshold (using a `threshold_match_probability` or `threshold_match_weight` when you call `predict()`). -### Blocking rules +## :simple-apachespark: Spark Performance -Blocking rules are the primary method for managing - -In most large datasets, it is computationally intractable to compare every row with every other row. - -The number of comparisons grows with the square of the number of input records, using the formula $\frac{n\left(n-1\right)}2$ . For instance, a million input records implies around 500bn comparisons. - -In Splink, we use a technique called blocking to dramatically reduce the number of comparisons by comparing only records that adhere to certain rules, such as that the first name and date of birth must be equal . Blocking is described further [here](https://toolkit.data.gov.au/Data_Linking_Information_Series_Sheet_4:_Probabilistic_linking.html). - -Even after blocking, the number of comparisons generated is usually much higher than the number of input records - often between 10 and 1,000 times higher. As a result, the performance of Splink is influenced most heavily by the number of comparisons generated by the blocking rules, rather than the number of input records. - -This is the case for both main uses of blocking rules in Splink: estimating parameters using expectation maximisation, and generating predictions. (See [here](https://moj-analytical-services.github.io/splink/topic_guides/blocking_rules.html) for more information on this distinction). - -#### How many comparisons will be generated by a blocking rule? 
- -The `linker.count_num_comparisons_from_blocking_rule()`, documented [here](https://moj-analytical-services.github.io/splink/linker.html#splink.linker.Linker.count_num_comparisons_from_blocking_rule) will compute the number of comparisons that will be generated from a blocking rule. - -Users are recommended to use this function before attempting linkage, since some blocking rules may imply trillions of comparisons, resulting in record linkage jobs which run for hours and never complete. - -In general, we recommend a strategy of starting with strict blocking rules, and gradually loosening them. Sticking to less than 10 million comparisons is a good place to start, before scaling jobs up to 100s of millions (DuckDB on a laptop), or sometimes billions (Athena or Spark). - -#### Examples of strict and loose blocking rules - -To give an example of how `blocking_rules_to_generate_predictions` rules may be incrementally loosened, we may start with the following rule: - -`l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob`. - -This is a very strict rule, and will only create comparisons where full name and date of birth match. This has the advantage of creating few record comparisons, but the disadvantage that the rule will miss true matches where there are typos or nulls in any of these three fields. - -This blocking rule could be loosened to: - -`substr(l.first_name,1,1) = substr(r.first_name,1,1) and l.surname = r.surname and l.year_of_birth = r.year_of_birth` - -Now it allows for typos or aliases in the first name, so long as the first letter is the same, and errors in month or day of birth. - -Depending on the side of your input data, the rule could be further loosened to - -`substr(l.first_name,1,1) = substr(r.first_name,1,1) and l.surname = r.surname` - -or even - -`l.surname = r.surname` - -The user could use the `linker.count_num_comparisons_from_blocking_rule()` function to select which rule is appropriate for their data. 
+As :simple-apachespark: Spark is designed to distribute processing across multiple machines, there are additional configuration options available to make jobs run more quickly. For more information, check out the [Spark Performance Topic Guide](./optimising_spark.md). \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 3742efa606..f96e076179 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -135,8 +135,9 @@ nav: - Term-Frequency adjustments: "topic_guides/term-frequency.md" - Performance: - Run times, performance and linking large data: "topic_guides/drivers_of_performance.md" - - Optimising Spark performance: "topic_guides/optimising_spark.md" - - Salting blocking rules: "topic_guides/salting.md" + - Spark Performance: + - Optimising Spark performance: "topic_guides/optimising_spark.md" + - Salting blocking rules: "topic_guides/salting.md" - Documentation: - Introduction: "documentation_index.md" - API: diff --git a/splink/linker.py b/splink/linker.py index 02f25ed6d3..82fa616bda 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2701,6 +2701,7 @@ def count_num_comparisons_from_blocking_rule( linker.count_num_comparisons_from_blocking_rule(br) ``` > 19387 + ```py br = "l.name = r.name and substr(l.dob,1,4) = substr(r.dob,1,4)" linker.count_num_comparisons_from_blocking_rule(br) @@ -2734,19 +2735,22 @@ def cumulative_comparisons_from_blocking_rules_records( for. If null, the rules set out in your settings object will be used. Examples: + Generate total comparisons from Blocking Rules defined in settings dictionary ```py linker_settings = DuckDBLinker(df, settings) # Compute the cumulative number of comparisons generated by the rules # in your settings object. linker_settings.cumulative_comparisons_from_blocking_rules_records() - >>> - # Generate total comparisons with custom blocking rules. + ``` + + Generate total comparisons with custom blocking rules.
+ ```py blocking_rules = [ "l.surname = r.surname", "l.first_name = r.first_name and substr(l.dob,1,4) = substr(r.dob,1,4)" ] - >>> + linker_settings.cumulative_comparisons_from_blocking_rules_records( blocking_rules ) @@ -2827,8 +2831,8 @@ def count_num_comparisons_from_blocking_rules_for_prediction(self, df_predict): Examples: ```py - linker = DuckDBLinker(df, connection=":memory:") - linker.load_settings("saved_settings.json") + linker = DuckDBLinker(df) + linker.load_model("settings.json") df_predict = linker.predict(threshold_match_probability=0.95) count_pairwise = linker.count_num_comparisons_from_blocking_rules_for_prediction(df_predict) count_pairwise.as_pandas_dataframe(limit=5) From 2df607b5180f654350f8e81ada3bb2af6b9b8730 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Tue, 4 Jul 2023 15:08:30 +0000 Subject: [PATCH 05/23] lint with black --- splink/linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/linker.py b/splink/linker.py index 82fa616bda..db52c982dc 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2701,7 +2701,7 @@ def count_num_comparisons_from_blocking_rule( linker.count_num_comparisons_from_blocking_rule(br) ``` > 19387 - + ```py br = "l.name = r.name and substr(l.dob,1,4) = substr(r.dob,1,4)" linker.count_num_comparisons_from_blocking_rule(br) From 47ff672200f36ab2477f030c4d87136808934b6b Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Tue, 4 Jul 2023 16:10:24 +0100 Subject: [PATCH 06/23] lint --- splink/linker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/splink/linker.py b/splink/linker.py index 82fa616bda..d72d2fce67 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2701,7 +2701,7 @@ def count_num_comparisons_from_blocking_rule( linker.count_num_comparisons_from_blocking_rule(br) ``` > 19387 - + ```py br = "l.name = r.name and substr(l.dob,1,4) = substr(r.dob,1,4)" linker.count_num_comparisons_from_blocking_rule(br) @@ -2735,7 +2735,8 @@ def 
cumulative_comparisons_from_blocking_rules_records( for. If null, the rules set out in your settings object will be used. Examples: - Generate total comparisons from Blocking Rules defined in settings dictionary + Generate total comparisons from Blocking Rules defined in settings + dictionary ```py linker_settings = DuckDBLinker(df, settings) # Compute the cumulative number of comparisons generated by the rules From ae411c03f09080e2bd1ba4d09732e634870c9d04 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Tue, 4 Jul 2023 15:11:07 +0000 Subject: [PATCH 07/23] lint with black --- splink/linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/linker.py b/splink/linker.py index d72d2fce67..6f948f4bce 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2735,7 +2735,7 @@ def cumulative_comparisons_from_blocking_rules_records( for. If null, the rules set out in your settings object will be used. Examples: - Generate total comparisons from Blocking Rules defined in settings + Generate total comparisons from Blocking Rules defined in settings dictionary ```py linker_settings = DuckDBLinker(df, settings) From 32cdc05d728b1c2b931e4e3ef2552e493dc96706 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Tue, 4 Jul 2023 16:23:12 +0100 Subject: [PATCH 08/23] remove blocking predictions notebook --- docs/topic_guides/blocking_predictions.ipynb | 97 -------------------- 1 file changed, 97 deletions(-) delete mode 100644 docs/topic_guides/blocking_predictions.ipynb diff --git a/docs/topic_guides/blocking_predictions.ipynb b/docs/topic_guides/blocking_predictions.ipynb deleted file mode 100644 index 48d4dcd580..0000000000 --- a/docs/topic_guides/blocking_predictions.ipynb +++ /dev/null @@ -1,97 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Blocking Rules for Splink Predictions\n", - "\n", - "The purpose of these blocking rules is to try and ensure that pairwise record comparisons are 
generated for all true matches." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using Prediction Blocking Rules in Splink\n", - "\n", - "Blocking Rules for Prediction are defined through the `blocking_rules_to_generate_predictions` parameter in the Settings dictionary of a model. For example:\n", - "\n", - "``` py hl_lines=\"3-6\"\n", - "settings = {\n", - " \"link_type\": \"dedupe_only\",\n", - " \"blocking_rules_to_generate_predictions\": [\n", - " \"l.first_name = r.first_name\",\n", - " \"l.surname = r.surname\",\n", - " ],\n", - " \"comparisons\": [\n", - " ctl.name_comparison(\"first_name\"),\n", - " ctl.name_comparison(\"surname\"),\n", - " ctl.date_comparison(\"dob\", cast_strings_to_date=True),\n", - " cl.exact_match(\"city\", term_frequency_adjustments=True),\n", - " ctl.email_comparison(\"email\"),\n", - " ],\n", - "}\n", - "```\n", - "\n", - "will generate comparisons for all true matches where names match. But it would miss a true match where there was a typo in (say) the first name.\n", - "\n", - "In general, it is usually impossible to find a single rule which both:\n", - "\n", - "- Reduces the number of comparisons generated to a computatally tractable number\n", - "\n", - "- Ensures comparisons are generated for all true matches\n", - "\n", - "This is why `blocking_rules_to_generate_predictions` is a list. 
Suppose we also block on `postcode`:\n", - "\n", - "```python\n", - "settings_example = {\n", - " \"blocking_rules_to_generate_predictions\" [\n", - " \"l.first_name = r.first_name and l.surname = r.surname\",\n", - " \"l.postcode = r.postcode\"\n", - " ]\n", - "}\n", - "```\n", - "\n", - "We will now generate a pairwise comparison for the record where there was a typo in the first name, so long as there isn't also a difference in the postcode.\n", - "\n", - "By specifying a variety of `blocking_rules_to_generate_predictions`, it becomes unlikely that a truly matching record would not be captured by at least one of the rules.\n", - "\n", - "Note that Splink automatically deduplicates the record comparisons it generates. So, in the example above, the `\"l.postcode = r.postcode\"` blocking rule generates only records comparisons that were not already captured by the `first_name` and `surname` rule." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Choosing Prediction Blocking Rules\n", - "\n", - "When defining blocking rules it is important to " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.9.12" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} From d39c70023d11f3e2d4ce54b4b8d34e1b7c3552c8 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Thu, 6 Jul 2023 17:56:28 +0100 Subject: [PATCH 09/23] improve topic guide folder structure --- .../{ => blocking}/blocking_rules.md | 0 .../model_training.md} | 0 .../performance.md} | 0 .../predictions.md} | 0 .../choosing_comparators.ipynb | 0 .../{ => comparisons}/comparators.md | 0 .../comparison_templates.ipynb | 0 .../customising_comparisons.ipynb | 0 .../{ => comparisons}/phonetic.md | 0 .../{ => 
comparisons}/term-frequency.md | 0 .../feature_engineering.md | 2 +- .../drivers_of_performance.md | 0 .../{ => performance}/optimising_spark.md | 0 .../topic_guides/{ => performance}/salting.md | 0 .../{ => splink_fundamentals}/backends.md | 0 .../{ => splink_fundamentals}/link_type.md | 0 .../querying_splink_results.md | 0 .../{ => splink_fundamentals}/settings.md | 0 .../{ => theory}/fellegi_sunter.md | 0 .../probabilistic_vs_deterministic.md | 0 .../{ => theory}/record_linkage.md | 0 mkdocs.yml | 42 +++++++++---------- 22 files changed, 22 insertions(+), 22 deletions(-) rename docs/topic_guides/{ => blocking}/blocking_rules.md (100%) rename docs/topic_guides/{blocking_model_training.md => blocking/model_training.md} (100%) rename docs/topic_guides/{blocking_performance.md => blocking/performance.md} (100%) rename docs/topic_guides/{blocking_predictions.md => blocking/predictions.md} (100%) rename docs/topic_guides/{ => comparisons}/choosing_comparators.ipynb (100%) rename docs/topic_guides/{ => comparisons}/comparators.md (100%) rename docs/topic_guides/{ => comparisons}/comparison_templates.ipynb (100%) rename docs/topic_guides/{ => comparisons}/customising_comparisons.ipynb (100%) rename docs/topic_guides/{ => comparisons}/phonetic.md (100%) rename docs/topic_guides/{ => comparisons}/term-frequency.md (100%) rename docs/topic_guides/{ => data_preparation}/feature_engineering.md (99%) rename docs/topic_guides/{ => performance}/drivers_of_performance.md (100%) rename docs/topic_guides/{ => performance}/optimising_spark.md (100%) rename docs/topic_guides/{ => performance}/salting.md (100%) rename docs/topic_guides/{ => splink_fundamentals}/backends.md (100%) rename docs/topic_guides/{ => splink_fundamentals}/link_type.md (100%) rename docs/topic_guides/{ => splink_fundamentals}/querying_splink_results.md (100%) rename docs/topic_guides/{ => splink_fundamentals}/settings.md (100%) rename docs/topic_guides/{ => theory}/fellegi_sunter.md (100%) rename 
docs/topic_guides/{ => theory}/probabilistic_vs_deterministic.md (100%) rename docs/topic_guides/{ => theory}/record_linkage.md (100%) diff --git a/docs/topic_guides/blocking_rules.md b/docs/topic_guides/blocking/blocking_rules.md similarity index 100% rename from docs/topic_guides/blocking_rules.md rename to docs/topic_guides/blocking/blocking_rules.md diff --git a/docs/topic_guides/blocking_model_training.md b/docs/topic_guides/blocking/model_training.md similarity index 100% rename from docs/topic_guides/blocking_model_training.md rename to docs/topic_guides/blocking/model_training.md diff --git a/docs/topic_guides/blocking_performance.md b/docs/topic_guides/blocking/performance.md similarity index 100% rename from docs/topic_guides/blocking_performance.md rename to docs/topic_guides/blocking/performance.md diff --git a/docs/topic_guides/blocking_predictions.md b/docs/topic_guides/blocking/predictions.md similarity index 100% rename from docs/topic_guides/blocking_predictions.md rename to docs/topic_guides/blocking/predictions.md diff --git a/docs/topic_guides/choosing_comparators.ipynb b/docs/topic_guides/comparisons/choosing_comparators.ipynb similarity index 100% rename from docs/topic_guides/choosing_comparators.ipynb rename to docs/topic_guides/comparisons/choosing_comparators.ipynb diff --git a/docs/topic_guides/comparators.md b/docs/topic_guides/comparisons/comparators.md similarity index 100% rename from docs/topic_guides/comparators.md rename to docs/topic_guides/comparisons/comparators.md diff --git a/docs/topic_guides/comparison_templates.ipynb b/docs/topic_guides/comparisons/comparison_templates.ipynb similarity index 100% rename from docs/topic_guides/comparison_templates.ipynb rename to docs/topic_guides/comparisons/comparison_templates.ipynb diff --git a/docs/topic_guides/customising_comparisons.ipynb b/docs/topic_guides/comparisons/customising_comparisons.ipynb similarity index 100% rename from docs/topic_guides/customising_comparisons.ipynb 
rename to docs/topic_guides/comparisons/customising_comparisons.ipynb diff --git a/docs/topic_guides/phonetic.md b/docs/topic_guides/comparisons/phonetic.md similarity index 100% rename from docs/topic_guides/phonetic.md rename to docs/topic_guides/comparisons/phonetic.md diff --git a/docs/topic_guides/term-frequency.md b/docs/topic_guides/comparisons/term-frequency.md similarity index 100% rename from docs/topic_guides/term-frequency.md rename to docs/topic_guides/comparisons/term-frequency.md diff --git a/docs/topic_guides/feature_engineering.md b/docs/topic_guides/data_preparation/feature_engineering.md similarity index 99% rename from docs/topic_guides/feature_engineering.md rename to docs/topic_guides/data_preparation/feature_engineering.md index 13912b5a7d..28e9020c0c 100644 --- a/docs/topic_guides/feature_engineering.md +++ b/docs/topic_guides/data_preparation/feature_engineering.md @@ -22,7 +22,7 @@ Below are some examples of features that be created from common columns, and how A sensible approach to comparing postcodes is to consider their consituent components. For example, UK postcodes can be broken down into the following substrings: -![UK postcode components from https://ideal-postcodes.co.uk/guides/uk-postcode-format](../img/postcode_components.png) +![UK postcode components from https://ideal-postcodes.co.uk/guides/uk-postcode-format](.../img/postcode_components.png) See [image source](https://ideal-postcodes.co.uk/guides/uk-postcode-format) for more details. Splink already includes a pre-built [postcode comparison template](../comparison_template_library.md##splink.comparison_template_library.PostcodeComparisonBase) which does this for you, generating by default a comparison with levels for an exact match on full postcode, sector, district and area in turn. These individual postcode components are engineered under-the-hood using the `regex_extract` argument (see below and [comparison_templates.ipynb](comparison_templates.ipynb) for more details). 
diff --git a/docs/topic_guides/drivers_of_performance.md b/docs/topic_guides/performance/drivers_of_performance.md similarity index 100% rename from docs/topic_guides/drivers_of_performance.md rename to docs/topic_guides/performance/drivers_of_performance.md diff --git a/docs/topic_guides/optimising_spark.md b/docs/topic_guides/performance/optimising_spark.md similarity index 100% rename from docs/topic_guides/optimising_spark.md rename to docs/topic_guides/performance/optimising_spark.md diff --git a/docs/topic_guides/salting.md b/docs/topic_guides/performance/salting.md similarity index 100% rename from docs/topic_guides/salting.md rename to docs/topic_guides/performance/salting.md diff --git a/docs/topic_guides/backends.md b/docs/topic_guides/splink_fundamentals/backends.md similarity index 100% rename from docs/topic_guides/backends.md rename to docs/topic_guides/splink_fundamentals/backends.md diff --git a/docs/topic_guides/link_type.md b/docs/topic_guides/splink_fundamentals/link_type.md similarity index 100% rename from docs/topic_guides/link_type.md rename to docs/topic_guides/splink_fundamentals/link_type.md diff --git a/docs/topic_guides/querying_splink_results.md b/docs/topic_guides/splink_fundamentals/querying_splink_results.md similarity index 100% rename from docs/topic_guides/querying_splink_results.md rename to docs/topic_guides/splink_fundamentals/querying_splink_results.md diff --git a/docs/topic_guides/settings.md b/docs/topic_guides/splink_fundamentals/settings.md similarity index 100% rename from docs/topic_guides/settings.md rename to docs/topic_guides/splink_fundamentals/settings.md diff --git a/docs/topic_guides/fellegi_sunter.md b/docs/topic_guides/theory/fellegi_sunter.md similarity index 100% rename from docs/topic_guides/fellegi_sunter.md rename to docs/topic_guides/theory/fellegi_sunter.md diff --git a/docs/topic_guides/probabilistic_vs_deterministic.md b/docs/topic_guides/theory/probabilistic_vs_deterministic.md similarity index 100% 
rename from docs/topic_guides/probabilistic_vs_deterministic.md rename to docs/topic_guides/theory/probabilistic_vs_deterministic.md diff --git a/docs/topic_guides/record_linkage.md b/docs/topic_guides/theory/record_linkage.md similarity index 100% rename from docs/topic_guides/record_linkage.md rename to docs/topic_guides/theory/record_linkage.md diff --git a/mkdocs.yml b/mkdocs.yml index f96e076179..3c0c9c5176 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -110,34 +110,34 @@ nav: - Topic Guides: - Introduction: "topic_guides/topic_guides_index.md" - Record Linkage Theory: - - Why do we need record linkage?: "topic_guides/record_linkage.md" - - Probabilistic vs Deterministic linkage: "topic_guides/probabilistic_vs_deterministic.md" - - The Fellegi-Sunter Model: "topic_guides/fellegi_sunter.md" + - Why do we need record linkage?: "topic_guides/theory/record_linkage.md" + - Probabilistic vs Deterministic linkage: "topic_guides/theory/probabilistic_vs_deterministic.md" + - The Fellegi-Sunter Model: "topic_guides/theory/fellegi_sunter.md" - Linkage Models in Splink: - - Splink's SQL backends - Spark, DuckDB etc: "topic_guides/backends.md" - - Link type - linking vs deduping: "topic_guides/link_type.md" - - Defining Splink models: "topic_guides/settings.md" - - Retrieving and querying Splink results: "topic_guides/querying_splink_results.md" + - Splink's SQL backends - Spark, DuckDB etc: "topic_guides/splink_fundamentals/backends.md" + - Link type - linking vs deduping: "topic_guides/splink_fundamentals/link_type.md" + - Defining Splink models: "topic_guides/splink_fundamentals/settings.md" + - Retrieving and querying Splink results: "topic_guides/splink_fundamentals/querying_splink_results.md" - Data Preparation: - - Feature Engineering: "topic_guides/feature_engineering.md" + - Feature Engineering: "topic_guides/data_preparation/feature_engineering.md" - Blocking: - - What are Blocking Rules?: "topic_guides/blocking_rules.md" - - Model Training Blocking Rules: 
"topic_guides/blocking_model_training.md" - - Prediction Blocking Rules: "topic_guides/blocking_predictions.md" - - Computational Performance: "topic_guides/blocking_performance.md" + - What are Blocking Rules?: "topic_guides/blocking/blocking_rules.md" + - Model Training Blocking Rules: "topic_guides/blocking/model_training.md" + - Prediction Blocking Rules: "topic_guides/blocking/predictions.md" + - Computational Performance: "topic_guides/blocking/performance.md" - Comparing Records: - - Defining and customising comparisons: "topic_guides/customising_comparisons.ipynb" - - Out-of-the-box comparisons: "topic_guides/comparison_templates.ipynb" + - Defining and customising comparisons: "topic_guides/comparisons/customising_comparisons.ipynb" + - Out-of-the-box comparisons: "topic_guides/comparisons/comparison_templates.ipynb" - Comparing strings: - - Choosing comparators and thresholds: "topic_guides/choosing_comparators.ipynb" - - String comparators: "topic_guides/comparators.md" - - Phonetic transformations: "topic_guides/phonetic.md" - - Term-Frequency adjustments: "topic_guides/term-frequency.md" + - Choosing comparators and thresholds: "topic_guides/comparisons/choosing_comparators.ipynb" + - String comparators: "topic_guides/comparisons/comparators.md" + - Phonetic transformations: "topic_guides/comparisons/phonetic.md" + - Term-Frequency adjustments: "topic_guides/comparisons/term-frequency.md" - Performance: - - Run times, performance and linking large data: "topic_guides/drivers_of_performance.md" + - Run times, performance and linking large data: "topic_guides/performance/drivers_of_performance.md" - Spark Performance: - - Optimising Spark performance: "topic_guides/optimising_spark.md" - - Salting blocking rules: "topic_guides/salting.md" + - Optimising Spark performance: "topic_guides/performance/optimising_spark.md" + - Salting blocking rules: "topic_guides/performance/salting.md" - Documentation: - Introduction: "documentation_index.md" - API: From 
61c814cc2b65ec44cc197e4ca25790629cde9792 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Jul 2023 10:41:18 +0100 Subject: [PATCH 10/23] fix relative paths --- docs/topic_guides/blocking/blocking_rules.md | 4 +--- docs/topic_guides/blocking/predictions.md | 2 +- .../comparisons/choosing_comparators.ipynb | 6 +++--- docs/topic_guides/comparisons/term-frequency.md | 14 +++++++------- .../data_preparation/feature_engineering.md | 2 +- docs/topic_guides/theory/fellegi_sunter.md | 4 ++-- .../theory/probabilistic_vs_deterministic.md | 4 ++-- docs/topic_guides/topic_guides_index.md | 12 ++++++------ 8 files changed, 23 insertions(+), 25 deletions(-) diff --git a/docs/topic_guides/blocking/blocking_rules.md b/docs/topic_guides/blocking/blocking_rules.md index 678f9e8114..574386f0da 100644 --- a/docs/topic_guides/blocking/blocking_rules.md +++ b/docs/topic_guides/blocking/blocking_rules.md @@ -10,9 +10,7 @@ One of the main challenges to overcome in record linkage is the **scale** of the The number of pairs of records to compare grows using the formula $\frac{n\left(n-1\right)}2$, i.e. with (approximately) the square of the number of records, as shown in the following chart: - - -![](../img/blocking/pairwise_comparisons.png) +![](../../img/blocking/pairwise_comparisons.png) For example, a dataset of 1 million input records would generate around 500 billion pairwise record comparisons. 
diff --git a/docs/topic_guides/blocking/predictions.md b/docs/topic_guides/blocking/predictions.md index df0dbf14a4..bea04cb708 100644 --- a/docs/topic_guides/blocking/predictions.md +++ b/docs/topic_guides/blocking/predictions.md @@ -76,7 +76,7 @@ linker = DuckDBLinker(df, settings) linker.cumulative_num_comparisons_from_blocking_rules_chart() ``` -![](../img/blocking/cumulative_comparisons.png) +![](../../img/blocking/cumulative_comparisons.png) Where, similar to the note above, the `l.surname = r.surname` bar in light blue is a count of all record comparisons that match on `surname` that have not already been captured by the `first_name` rule. diff --git a/docs/topic_guides/comparisons/choosing_comparators.ipynb b/docs/topic_guides/comparisons/choosing_comparators.ipynb index 1b7970d4b1..04bb5f0707 100644 --- a/docs/topic_guides/comparisons/choosing_comparators.ipynb +++ b/docs/topic_guides/comparisons/choosing_comparators.ipynb @@ -394,7 +394,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![](../img/choosing_comparators/comparator_score_chart.png)" + "![](../../img/choosing_comparators/comparator_score_chart.png)" ] }, { @@ -706,7 +706,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![](../img/choosing_comparators/comparator__score_threshold_chart.png)" + "![](../../img/choosing_comparators/comparator__score_threshold_chart.png)" ] }, { @@ -1108,7 +1108,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![](../img/choosing_comparators/phonetic_match_chart.png)" + "![](../../img/choosing_comparators/phonetic_match_chart.png)" ] }, { diff --git a/docs/topic_guides/comparisons/term-frequency.md b/docs/topic_guides/comparisons/term-frequency.md index dc4f0157ca..61aa83bb35 100644 --- a/docs/topic_guides/comparisons/term-frequency.md +++ b/docs/topic_guides/comparisons/term-frequency.md @@ -10,7 +10,7 @@ tags: A common shortcoming of the Fellegi-Sunter model is that it doesn’t account for skew in the distributions of linking 
variables. One of the starkest examples is a binary variable such as gender in the prison population, where male offenders outnumber female offenders by 10:1. -![](../img/term_frequency/gender-distribution.png){width="800"} +![](../../img/term_frequency/gender-distribution.png){width="800"} #### How does this affect our m and u probabilities? @@ -26,7 +26,7 @@ In this example, one solution might be to create an extra comparison level for m However, this complexity forces us to estimate two m probabilities when one would do, and it becomes impractical if we extend to higher-cardinality variables like surname, requiring thousands of additional comparison levels. -![](../img/term_frequency/surname-distribution.png){width="800"} +![](../../img/term_frequency/surname-distribution.png){width="800"} This problem used to be addressed with an ex-post (after the fact) solution - once the linking is done, we have a look at the average match probability for each value in a column to determine which values tend to be stronger indicators of a match. If the average match probability for records pairs that share a surname is 0.2 but the average for the specific surname Smith is 0.1 then we know that the match weight for name should be adjusted downwards for Smiths. @@ -36,7 +36,7 @@ The shortcoming of this option is that in practice, the model training is conduc Below is an illustration of 2 datasets (10 records each) with skewed distributions of **first name**. A `link_and_dedupe` Splink model concatenates these two tables and deduplicates those 20 records. -![](../img/term_frequency/tf-intro.drawio.png) +![](../../img/term_frequency/tf-intro.drawio.png) In principle, u probabilities for a small dataset like this can be estimated directly - out of 190 possible pairwise comparisons, 77 of them have the same **first name**. Based on the assumption that matches are rare (i.e. the vast majority of these comparisons are non-matches), we use this as a direct estimate of u. 
Random sampling makes the same assumption, but by using a manageable-sized sample of a much larger dataset where it would be prohibitively costly to perform all possible comparisons (a Cartesian join). @@ -44,7 +44,7 @@ Once we have concatenated our input tables, it is useful to calculate the term f Building on the example above, we can define the m and u probabilities for a specific **first name** value, and work out an expression for the resulting match weight. -![](../img/term_frequency/calc.png) +![](../../img/term_frequency/calc.png) Just as we can add independent match weights for **name**, **DOB** and other comparisons (as shown in the Splink waterfall charts), we can also add an independent TF adjustment term for each comparison. This is useful because: @@ -56,17 +56,17 @@ Just as we can add independent match weights for **name**, **DOB** and other com - We can easily disentangle and visualise the aggregate significance of a particular column, separately from the deviations within it (see charts below) -![](../img/term_frequency/example.png){width="300"} +![](../../img/term_frequency/example.png){width="300"} ## Visualising TF Adjustments For an individual comparison of two records, we can see the impact of TF adjustments in the waterfall charts: -![This example shows two records having a match weight of +15.69 due to a match on **first name**, **surname** and **DOB**. Due to relatively uncommon values for all 3 of these, they each have an additional term frequency adjustment contributing around +5 to the final match weight](../img/term_frequency/waterfall.png) +![This example shows two records having a match weight of +15.69 due to a match on **first name**, **surname** and **DOB**. 
Due to relatively uncommon values for all 3 of these, they each have an additional term frequency adjustment contributing around +5 to the final match weight](../../img/term_frequency/waterfall.png) We can also see these match weights and TF adjustments summarised using a chart like the below to highlight common and uncommon names. We do this already using the Splink linker's profile_columns method, but once we know the u probabilities for our comparison columns, we can show these outliers in terms of their impact on match weight: -![In this example of names from FEBRL data used in the demo notebooks, we see that a match on first name has a match weight of +6. For an uncommon name like Portia this is increased, whereas a common name like Jack has a reduced match weight. This chart can be generated using `linker.tf_adjustment_chart("name")`](../img/term_frequency/tf-match-weight.png){width="800"} +![In this example of names from FEBRL data used in the demo notebooks, we see that a match on first name has a match weight of +6. For an uncommon name like Portia this is increased, whereas a common name like Jack has a reduced match weight. This chart can be generated using `linker.tf_adjustment_chart("name")`](../../img/term_frequency/tf-match-weight.png){width="800"} ## Applying TF adjustments in Splink diff --git a/docs/topic_guides/data_preparation/feature_engineering.md b/docs/topic_guides/data_preparation/feature_engineering.md index 28e9020c0c..7d2e82f860 100644 --- a/docs/topic_guides/data_preparation/feature_engineering.md +++ b/docs/topic_guides/data_preparation/feature_engineering.md @@ -22,7 +22,7 @@ Below are some examples of features that be created from common columns, and how A sensible approach to comparing postcodes is to consider their consituent components. 
For example, UK postcodes can be broken down into the following substrings: -![UK postcode components from https://ideal-postcodes.co.uk/guides/uk-postcode-format](.../img/postcode_components.png) +![UK postcode components from https://ideal-postcodes.co.uk/guides/uk-postcode-format](../../img/postcode_components.png) See [image source](https://ideal-postcodes.co.uk/guides/uk-postcode-format) for more details. Splink already includes a pre-built [postcode comparison template](../comparison_template_library.md##splink.comparison_template_library.PostcodeComparisonBase) which does this for you, generating by default a comparison with levels for an exact match on full postcode, sector, district and area in turn. These individual postcode components are engineered under-the-hood using the `regex_extract` argument (see below and [comparison_templates.ipynb](comparison_templates.ipynb) for more details). diff --git a/docs/topic_guides/theory/fellegi_sunter.md b/docs/topic_guides/theory/fellegi_sunter.md index a8b19b3980..4d8a332490 100644 --- a/docs/topic_guides/theory/fellegi_sunter.md +++ b/docs/topic_guides/theory/fellegi_sunter.md @@ -157,7 +157,7 @@ So, considering these properties, the total Match Weight for two observed record The Match Weight is the central metric showing the amount of evidence of a match is provided by each of the features in a model. 
The is most easily shown through Splink's Waterfall Chart: -![](../img/fellegi_sunter/waterfall.png) +![](../../img/fellegi_sunter/waterfall.png) 1️⃣ are the two records being compared @@ -207,7 +207,7 @@ It can be helpful to build up some inuition for how Match Weights translate into Plotting Match Probability versus Match Weight gives the following chart: -![](../img/fellegi_sunter/prob_v_weight.png) +![](../../img/fellegi_sunter/prob_v_weight.png) Some observations from this chart: diff --git a/docs/topic_guides/theory/probabilistic_vs_deterministic.md b/docs/topic_guides/theory/probabilistic_vs_deterministic.md index e86bf9af34..1b1f0859a8 100644 --- a/docs/topic_guides/theory/probabilistic_vs_deterministic.md +++ b/docs/topic_guides/theory/probabilistic_vs_deterministic.md @@ -57,11 +57,11 @@ Probabilistic Linkage is a **evidence-based** approach for joining records toget Linkage is probabilistic in the sense that it relies on the balance of evidence. In a large dataset, observing that two records match on the full name 'Robert Smith' provides some evidence that these two records may refer to the same person, but this evidence is inconclusive. However, the cumulative evidence from across multiple features within the dataset (e.g. date of birth, home address, email address) can provide conclusive evidence of a match. The evidence for a match is commonly represented as a probability. 
For example, putting the first 2 records of the table above through a probabilistic model gives a an overall probability that the records are a match: -![](../img/probabilistic_vs_deterministic/probabilistic_example.png) +![](../../img/probabilistic_vs_deterministic/probabilistic_example.png) In addition, the breakdown of this probability by the evidence provided by each feature can be shown through a waterfall chart: -![](../img/probabilistic_vs_deterministic/simplified_waterfall.png) +![](../../img/probabilistic_vs_deterministic/simplified_waterfall.png) Given these probabilities, unlike (binary) Deterministic linkage, the user can choose an **evidence threshold** for what they consider a match before creating a new unique identifier. diff --git a/docs/topic_guides/topic_guides_index.md b/docs/topic_guides/topic_guides_index.md index f7fc09946f..695e77f2ee 100644 --- a/docs/topic_guides/topic_guides_index.md +++ b/docs/topic_guides/topic_guides_index.md @@ -4,12 +4,12 @@ This section contains in-depth guides on a variety of topics and concepts within The topic guides are broken up into the following categories: -1. [Record Linkage Theory](record_linkage.md) - for an introduction to data linkage from a theoretical perspective, and to help build some intuition around the parameters being estimated in Splink models. -2. [Linkage Models in Splink](backends.md) - for an introduction to the building blocks of a Splink model. Including the supported SQL Backends and how to define a model with a Splink Settings dictionary. -3. [Data Preparation](./feature_engineering.md) - for guidance on perparing your data for linkage. Including guidance on feature engineering to help improve Splink models. -4. [Comparing Records](./customising_comparisons.ipynb) - for guidance on defining `Comparison`s withing a Splink model. 
Including how comparing records are structured within `Comparison`s, how to utilise string comparators for fuzzy matching and how deal with skewed data with Term Frequency Adjustments. -5. [Blocking](./blocking_rules.md) - for an introduction to Blocking Rules and their purpose within record linkage. Including how blocking rules are used in different contexts within Splink. +1. [Record Linkage Theory](theory/record_linkage.md) - for an introduction to data linkage from a theoretical perspective, and to help build some intuition around the parameters being estimated in Splink models. +2. [Linkage Models in Splink](splink_fundamentals/backends.md) - for an introduction to the building blocks of a Splink model. Including the supported SQL Backends and how to define a model with a Splink Settings dictionary. +3. [Data Preparation](data_preparation/feature_engineering.md) - for guidance on preparing your data for linkage. Including guidance on feature engineering to help improve Splink models. +4. [Comparing Records](comparisons/customising_comparisons.ipynb) - for guidance on defining `Comparison`s within a Splink model. Including how comparing records are structured within `Comparison`s, how to utilise string comparators for fuzzy matching and how to deal with skewed data with Term Frequency Adjustments. +5. [Blocking](blocking/blocking_rules.md) - for an introduction to Blocking Rules and their purpose within record linkage. Including how blocking rules are used in different contexts within Splink. 6. Model Training - for guidance on the methods for training a Splink model, and how to choose them for specific use cases. (Coming soon) 7. Clustering - for guidance on how records are clustered together. (Coming Soon) 8. Model QA - for guidance on how to Quality Assure a Splink model & the resulting clusters. Including Clerical Labelling. (Coming Soon) -9. [Performance](./optimising_spark.md) - for guidance on how to make Splink models run more efficiently.
\ No newline at end of file +9. [Performance](performance/optimising_spark.md) - for guidance on how to make Splink models run more efficiently. \ No newline at end of file From 7f8b61a9559c1c2f486077e9571c7890999c90ee Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Jul 2023 10:49:03 +0100 Subject: [PATCH 11/23] tight -> strict --- docs/topic_guides/blocking/model_training.md | 2 +- docs/topic_guides/blocking/performance.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/topic_guides/blocking/model_training.md b/docs/topic_guides/blocking/model_training.md index 6bce368d88..641349e233 100644 --- a/docs/topic_guides/blocking/model_training.md +++ b/docs/topic_guides/blocking/model_training.md @@ -26,7 +26,7 @@ linker.estimate_parameters_using_expectation_maximisation( Here, we have defined a "block" of records where `first_name` are the same. As names are not unique, we can be pretty sure that there will be a combination of matches and non-matches in this "block" which is what is required for the EM algorithm. -Matching only on `first_name` will likely generate a large "block" of pairwise comparisons which will take longer to run. In this case it may be worthwhile applying a tighter blocking rule to reduce runtime. For example, a match on `first_name` and `surname`: +Matching only on `first_name` will likely generate a large "block" of pairwise comparisons which will take longer to run. In this case it may be worthwhile applying a stricter blocking rule to reduce runtime. 
For example, a match on `first_name` and `surname`: ```python diff --git a/docs/topic_guides/blocking/performance.md b/docs/topic_guides/blocking/performance.md index ea206ee9cd..e600a895f9 100644 --- a/docs/topic_guides/blocking/performance.md +++ b/docs/topic_guides/blocking/performance.md @@ -7,15 +7,15 @@ When considering computational performance of blocking rules, there are two main Below we run through an example of how to address each of these drivers. -## Tight vs loose Blocking Rules +## Strict vs lenient Blocking Rules -One way to reduce the number of comparisons being considered within a model is to apply tight (or strict) blocking rules. However, this can have a significant impact on the how well the Splink model works. +One way to reduce the number of comparisons being considered within a model is to apply strict blocking rules. However, this can have a significant impact on the how well the Splink model works. -In reality, we recommend getting a model up and running with strict Blocking Rules and incrementally loosening them to see the impact on the runtime and quality of the results. By starting with tight blocking rules, the linking process will run faster which will means you can iterate through model versions more quickly. +In reality, we recommend getting a model up and running with strict Blocking Rules and incrementally loosening them to see the impact on the runtime and quality of the results. By starting with strict blocking rules, the linking process will run faster which will means you can iterate through model versions more quickly. ??? example "Example - Incrementally loosening Prediction Blocking Rules" - When choosing Prediction Blocking Rules, consider how `blocking_rules_to_generate_predictions` may be incrementally loosened. We may start with the following rule: + When choosing Prediction Blocking Rules, consider how `blocking_rules_to_generate_predictions` may be made incrementally less strict. 
We may start with the following rule: `l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob`. From e8a4183709ed4dc1f9e69fc839e81d06f7c8e26a Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Jul 2023 11:28:27 +0100 Subject: [PATCH 12/23] add extremes example --- docs/topic_guides/blocking/blocking_rules.md | 23 ++++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/docs/topic_guides/blocking/blocking_rules.md b/docs/topic_guides/blocking/blocking_rules.md index 574386f0da..68f882f5cf 100644 --- a/docs/topic_guides/blocking/blocking_rules.md +++ b/docs/topic_guides/blocking/blocking_rules.md @@ -24,11 +24,11 @@ Considering a dataset of 1 million records, comparing each record against all of Instead, we can define a subset of potential comparisons using **Blocking Rules**. These are rules that define "blocks" of comparisons that should be considered. For example, the blocking rule: - `"l.first_name = r.first_name and l.surname = r.surname"` +`"l.first_name = r.first_name and l.surname = r.surname"` - will generate pairwise record comparisons amongst pairwise comparisons where first name and surname match. +will generate pairwise record comparisons amongst pairwise comparisons where first name and surname match. - Within a Splink model, you can specify multiple "blocks" through multiple Blocking Rules to ensure all potential matches are considered. +Within a Splink model, you can specify multiple "blocks" through multiple Blocking Rules to ensure all potential matches are considered. ???+ "Further Reading" @@ -36,13 +36,22 @@ Instead, we can define a subset of potential comparisons using **Blocking Rules* ### Choosing Blocking Rules - The blocking process is a compromise between the amount of **compuational resource** used when comparing records and **capturing all true matches**. 
+The blocking process is a compromise between the amount of **compuational resource** used when comparing records and **capturing all true matches**. - Even after blocking, the number of comparisons generated is usually much higher than the number of input records - often between 10 and 1,000 times higher. As a result, the performance of Splink is heavily influenced by the number of comparisons generated by the blocking rules, rather than the number of input records. +Even after blocking, the number of comparisons generated is usually much higher than the number of input records - often between 10 and 1,000 times higher. As a result, the performance of Splink is heavily influenced by the number of comparisons generated by the blocking rules, rather than the number of input records. - Getting the balance right between compuational resource and capturing matches can be tricky, and is largely dependent on the specific datasets and use case of the linkage. In general, we recommend a strategy of starting with strict blocking rules, and gradually loosening them. Sticking to less than 10 million comparisons is a good place to start, before scaling jobs up to 100s of millions (:simple-duckdb: DuckDB on a laptop), or sometimes billions (:simple-apachespark: Spark or :simple-amazonaws: Athena). +Getting the balance right between computational resource and capturing matches can be tricky, and is largely dependent on the specific datasets and use case of the linkage. In general, we recommend a strategy of starting with strict blocking rules, and gradually loosening them. Sticking to less than 10 million comparisons is a good place to start, before scaling jobs up to 100s of millions (:simple-duckdb: DuckDB on a laptop), or sometimes billions (:simple-apachespark: Spark or :simple-amazonaws: Athena). - Guidance for choosing Blocking Rules can be found in the two [Blocking in Splink](#blocking-in-splink) topic guides.
+Guidance for choosing Blocking Rules can be found in the two [Blocking in Splink](#blocking-in-splink) topic guides. + +!!! note "Taking blocking to the extremes" + If you have a large dataset to deduplicate, let's consider the implications of two cases of taking blocking to the extremes: + + **Not enough blocking** (ensuring all matches are captured) + There will be too many record pairs to consider, which will take an extremely long time to run (hours/days) or the process will be so large that it crashes. + + **Too much blocking** (minimising computational resource) + There won't be enough record pairs to consider, so the model won't perform well (or will struggle to be trained at all). ## Blocking in Splink From 3a59c4911511b0b7c3a4e4b9abd812dbaee9a500 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Jul 2023 13:21:27 +0100 Subject: [PATCH 13/23] Fix performance section --- docs/topic_guides/performance/optimising_spark.md | 3 ++- docs/topic_guides/topic_guides_index.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/topic_guides/performance/optimising_spark.md b/docs/topic_guides/performance/optimising_spark.md index 1c412bac08..ac8e58947c 100644 --- a/docs/topic_guides/performance/optimising_spark.md +++ b/docs/topic_guides/performance/optimising_spark.md @@ -37,7 +37,8 @@ linker = SparkLinker( Splink uses an iterative algorithm for model training, and more generally, lineage is long and complex. We have found that big jobs fail to complete without further optimisation. This is a [well-known problem](https://www.pdl.cmu.edu/PDL-FTP/Storage/CMU-PDL-18-101.pdf): -> > > "This long lineage bottleneck is widely known by sophisticated Spark application programmers. A common practice for dealing with long lineage is to have the application program strategically checkpoint RDDs at code locations that truncate much of the lineage for checkpointed data and resume computation immediately from the checkpoint." +!!!
quote + "This long lineage bottleneck is widely known by sophisticated Spark application programmers. A common practice for dealing with long lineage is to have the application program strategically checkpoint RDDs at code locations that truncate much of the lineage for checkpointed data and resume computation immediately from the checkpoint." Splink will automatically break lineage in sensible places. We have found in practice that, when running Spark jobs backed by AWS S3, the fastest method of breaking lineage is persisting outputs to `.parquet` file. diff --git a/docs/topic_guides/topic_guides_index.md b/docs/topic_guides/topic_guides_index.md index 773825333a..cd8d619811 100644 --- a/docs/topic_guides/topic_guides_index.md +++ b/docs/topic_guides/topic_guides_index.md @@ -12,4 +12,4 @@ The topic guides are broken up into the following categories: 6. Model Training - for guidance on the methods for training a Splink model, and how to choose them for specific use cases. (Coming soon) 7. Clustering - for guidance on how records are clustered together. (Coming Soon) 8. Model QA - for guidance on how to Quality Assure a Splink model & the resulting clusters. Including Clerical Labelling. (Coming Soon) -9. [Performance](performance/optimising_spark.md) - for guidance on how to make Splink models run more efficiently. +9. [Performance](performance/drivers_of_performance.md) - for guidance on how to make Splink models run more efficiently. 
From c8689b3b9409c391b963b73f0c1cd58732495bee Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Jul 2023 15:47:47 +0100 Subject: [PATCH 14/23] add options for "new" flag --- mkdocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index adcf5b7d10..e268a0dbf2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,6 +22,7 @@ theme: - navigation.tabs.sticky - navigation.top - toc.follow + - navigation.prune logo: "img/favicon.ico" favicon: "img/favicon.ico" palette: @@ -200,5 +201,6 @@ extra: link: https://pypi.org/project/splink/ - icon: fontawesome/solid/chevron-right link: https://www.robinlinacre.com/ + new: Recently added From c26fee94f508bcc67303c90c53bd02262bc173a9 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Fri, 14 Jul 2023 08:55:22 +0100 Subject: [PATCH 15/23] Update docs/topic_guides/blocking/performance.md Co-authored-by: Robin Linacre --- docs/topic_guides/blocking/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topic_guides/blocking/performance.md b/docs/topic_guides/blocking/performance.md index e600a895f9..f3c6c140db 100644 --- a/docs/topic_guides/blocking/performance.md +++ b/docs/topic_guides/blocking/performance.md @@ -46,7 +46,7 @@ From a performance prespective, here we consider two classes of blocking rule: - Equi-join conditions - Filter conditions -### Equi-join Blocking Rules +### Equi-join Conditions Equi-joins are simply equality conditions between records, e.g. 
From f44e22ab60e3ba9225d3f6fa0e2cfb313b8e41c8 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Fri, 14 Jul 2023 08:55:43 +0100 Subject: [PATCH 16/23] Update docs/topic_guides/blocking/performance.md Co-authored-by: Robin Linacre --- docs/topic_guides/blocking/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topic_guides/blocking/performance.md b/docs/topic_guides/blocking/performance.md index f3c6c140db..13a694631d 100644 --- a/docs/topic_guides/blocking/performance.md +++ b/docs/topic_guides/blocking/performance.md @@ -57,7 +57,7 @@ These equality-based blocking rules are extremely efficient and can be executed Equality-based blocking rules should be considered the default method for defining blocking rules and form the basis of the upcoming [Blocking Rules Library](https://github.com/moj-analytical-services/splink/pull/1370). -### Filter Blocking Rules +### Filter Conditions Filter conditions refer to any Blocking Rule that isn't a simple equality between columns. E.g. From 2414a6dd10d3ab6615655c4dccb3a3308ef07065 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Fri, 14 Jul 2023 08:56:01 +0100 Subject: [PATCH 17/23] Update docs/topic_guides/blocking/performance.md Co-authored-by: Robin Linacre --- docs/topic_guides/blocking/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topic_guides/blocking/performance.md b/docs/topic_guides/blocking/performance.md index 13a694631d..46df6a6bf6 100644 --- a/docs/topic_guides/blocking/performance.md +++ b/docs/topic_guides/blocking/performance.md @@ -82,4 +82,4 @@ Just as how Blocking Rules can impact on performance, so can how they are combin Given the ability to parallelise operations in Spark, there are some additional configuration options which can improve performance of blocking. Please refer to the Spark Performance Topic Guides for more information. 
- Note: In Spark Equi-joins can also be referred to as **hashed** rules, and facilitates splitting the workload across multiple machines. \ No newline at end of file + Note: In Spark Equi-joins are implemented using hash partitioning, which facilitates splitting the workload across multiple machines. \ No newline at end of file From db554bb0d7133446ba1e5ad6aa741415f24234c0 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Fri, 14 Jul 2023 08:56:46 +0100 Subject: [PATCH 18/23] Update docs/topic_guides/blocking/performance.md Co-authored-by: Robin Linacre --- docs/topic_guides/blocking/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topic_guides/blocking/performance.md b/docs/topic_guides/blocking/performance.md index 46df6a6bf6..2d906a8765 100644 --- a/docs/topic_guides/blocking/performance.md +++ b/docs/topic_guides/blocking/performance.md @@ -76,7 +76,7 @@ Just as how Blocking Rules can impact on performance, so can how they are combin `l.first_name = r.first_name OR l.surname = r.surname` - +In most SQL engines, an `OR` condition within a blocking rule will result in all possible record comparisons being generated. That is, the whole blocking rule becomes a filter condition rather than an equi-join condition, so these should be avoided. For further information, see [here](https://github.com/moj-analytical-services/splink/discussions/1417#discussioncomment-6420575). ??? 
note "Spark-specific Further Reading" From 1d779d75789413799e86d3af70a481426495d2ed Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Fri, 14 Jul 2023 08:57:05 +0100 Subject: [PATCH 19/23] Update docs/topic_guides/blocking/blocking_rules.md Co-authored-by: Robin Linacre --- docs/topic_guides/blocking/blocking_rules.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topic_guides/blocking/blocking_rules.md b/docs/topic_guides/blocking/blocking_rules.md index 68f882f5cf..4d49670d89 100644 --- a/docs/topic_guides/blocking/blocking_rules.md +++ b/docs/topic_guides/blocking/blocking_rules.md @@ -28,7 +28,7 @@ Instead, we can define a subset of potential comparisons using **Blocking Rules* will generate pairwise record comparisons amongst pairwise comparisons where first name and surname match. -Within a Splink model, you can specify multiple "blocks" through multiple Blocking Rules to ensure all potential matches are considered. +Within a Splink model, you can specify multiple Blocking Rules to ensure all potential matches are considered. These are provided as a list. Splink will then produce all record comparisons that satisfy at least one of your blocking rules. 
???+ "Further Reading" From be7e80c5eef16b6e81b0921c216871cbcb25dee4 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Fri, 14 Jul 2023 08:57:34 +0100 Subject: [PATCH 20/23] Update docs/topic_guides/blocking/blocking_rules.md Co-authored-by: Robin Linacre --- docs/topic_guides/blocking/blocking_rules.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topic_guides/blocking/blocking_rules.md b/docs/topic_guides/blocking/blocking_rules.md index 4d49670d89..8a886ff21d 100644 --- a/docs/topic_guides/blocking/blocking_rules.md +++ b/docs/topic_guides/blocking/blocking_rules.md @@ -26,7 +26,7 @@ Instead, we can define a subset of potential comparisons using **Blocking Rules* `"l.first_name = r.first_name and l.surname = r.surname"` -will generate pairwise record comparisons amongst pairwise comparisons where first name and surname match. +will generate only those pairwise record comparisons where first name and surname match. Within a Splink model, you can specify multiple Blocking Rules to ensure all potential matches are considered. These are provided as a list. Splink will then produce all record comparisons that satisfy at least one of your blocking rules. 
From b166c506446e268d01b802d88bfc17ac327ed31f Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Mon, 17 Jul 2023 14:47:20 +0100 Subject: [PATCH 21/23] additional wording --- docs/topic_guides/blocking/blocking_rules.md | 12 ++++++++---- docs/topic_guides/blocking/predictions.md | 2 ++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/topic_guides/blocking/blocking_rules.md b/docs/topic_guides/blocking/blocking_rules.md index 8a886ff21d..1c811e1878 100644 --- a/docs/topic_guides/blocking/blocking_rules.md +++ b/docs/topic_guides/blocking/blocking_rules.md @@ -36,7 +36,11 @@ Within a Splink model, you can specify multiple Blocking Rules to ensure all pot ### Choosing Blocking Rules -The blocking process is a compromise between the amount of **compuational resource** used when comparing records and **capturing all true matches**. +The aim of blocking rules is to recover all matching record pairs (i.e. to have **high recall**). + +It is less important if the blocking rules select some (or even many) record pairs which are not matches (i.e. it is less important to have **high precision**). Record comparisons that 'pass' the blocking rules are then put forward to the scoring/prediction step. The more pairs let through, the more computation is required at the prediction step. + +Ultimately, the blocking process is a compromise between the amount of **computational resource** used when comparing records and **capturing all true matches**. Even after blocking, the number of comparisons generated is usually much higher than the number of input records - often between 10 and 1,000 times higher. As a result, the performance of Splink is heavily influenced by the number of comparisons generated by the blocking rules, rather than the number of input records.
@@ -58,8 +62,8 @@ Guidance for choosing Blocking Rules can be found in the two [Blocking in Splink There are two areas in Splink where blocking is used: -- [Training a Splink model](./blocking_model_training.md) -- [Making Predictions from a Splink model](./blocking_predictions.md) +- The first is to [generate pairwise comparisons when finding links](./predictions.md) (running `predict()`). This is the sense in which 'blocking' is usually understood in the context of record linkage. -each of which is described in their own, dedicated topic guide. +- The second is a less familiar application of blocking: using it for [model training](./model_training.md). +Each of these is described in its own dedicated topic guide. diff --git a/docs/topic_guides/blocking/predictions.md b/docs/topic_guides/blocking/predictions.md index bea04cb708..cc6f429d67 100644 --- a/docs/topic_guides/blocking/predictions.md +++ b/docs/topic_guides/blocking/predictions.md @@ -47,6 +47,8 @@ settings_example = { } ``` +This generates all pairwise comparisons that satisfy at least one of the rules. + We will now generate a pairwise comparison for the record where there was a typo in the first name, so long as there isn't also a difference in the postcode. By specifying a variety of `blocking_rules_to_generate_predictions`, it becomes unlikely that a truly matching record would not be captured by at least one of the rules.
From c762a61b4af067bb1696a8101f5bed9f9c2df0d2 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Mon, 17 Jul 2023 15:25:42 +0100 Subject: [PATCH 22/23] updates for brl --- docs/topic_guides/blocking/model_training.md | 17 ++++++++++++----- docs/topic_guides/blocking/performance.md | 4 +++- docs/topic_guides/blocking/predictions.md | 18 ++++++++++++------ mkdocs.yml | 2 +- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/docs/topic_guides/blocking/model_training.md b/docs/topic_guides/blocking/model_training.md index 641349e233..68d5570046 100644 --- a/docs/topic_guides/blocking/model_training.md +++ b/docs/topic_guides/blocking/model_training.md @@ -16,8 +16,9 @@ The Expectation Maximisation algorithm seems to work best when the pairwise reco Blocking Rules for Model Training are used as a parameter in the `estimate_parameters_using_expectation_maximisation` function. After a `linker` object has been instantiated, you can estimate `m probability` with training sessions such as: ```python +import splink.duckdb.blocking_rule_library as brl -blocking_rule_for_training = "l.first_name = r.first_name" +blocking_rule_for_training = brl.exact_match_rule("first_name") linker.estimate_parameters_using_expectation_maximisation( blocking_rule_for_training ) @@ -30,7 +31,10 @@ Matching only on `first_name` will likely generate a large "block" of pairwise c ```python -blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname" +blocking_rule_for_training = brl.and_( + brl.exact_match_rule("first_name"), + brl.exact_match_rule("surname") + ) linker.estimate_parameters_using_expectation_maximisation( blocking_rule_for_training ) @@ -47,10 +51,13 @@ The idea behind Training Rules is to consider "blocks" of record pairs with a mi There are some tools within Splink to help choosing these rules. 
For example, the `count_num_comparisons_from_blocking_rule` gives the number of records pairs generated by a blocking rule: ```py - -linker.count_num_comparisons_from_blocking_rule("l.first_name = r.first_name AND l.surname = r.surname") - +blocking_rule = brl.and_( + brl.exact_match_rule("first_name"), + brl.exact_match_rule("surname") + ) +linker.count_num_comparisons_from_blocking_rule(blocking_rule) ``` +> 1056 It is recommended that you run this function to check how many comparisons are generated before training a model so that you do not needlessly run a training session on billions of comparisons. diff --git a/docs/topic_guides/blocking/performance.md b/docs/topic_guides/blocking/performance.md index 2d906a8765..968097209f 100644 --- a/docs/topic_guides/blocking/performance.md +++ b/docs/topic_guides/blocking/performance.md @@ -54,7 +54,9 @@ Equi-joins are simply equality conditions between records, e.g. These equality-based blocking rules are extremely efficient and can be executed quickly, even on very large datasets. -Equality-based blocking rules should be considered the default method for defining blocking rules and form the basis of the upcoming [Blocking Rules Library](https://github.com/moj-analytical-services/splink/pull/1370). +Equality-based blocking rules should be considered the default method for defining blocking rules and form the basis of the [Blocking Rules Library](../../blocking_rule_library.md). For example, the above example can be written as: + +`brl.exact_match_rule("first_name")` ### Filter Conditions diff --git a/docs/topic_guides/blocking/predictions.md b/docs/topic_guides/blocking/predictions.md index cc6f429d67..578d391527 100644 --- a/docs/topic_guides/blocking/predictions.md +++ b/docs/topic_guides/blocking/predictions.md @@ -12,11 +12,14 @@ The aim of Prediction Blocking Rules are to: Blocking Rules for Prediction are defined through `blocking_rules_to_generate_predictions` in the Settings dictionary of a model. 
For example: -``` py hl_lines="3-5" +``` py hl_lines="3-8" settings = { "link_type": "dedupe_only", "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name and l.surname = r.surname" + brl.and_( + brl.exact_match_rule("first_name"), + brl.exact_match_rule("surname") + ) ], "comparisons": [ ctl.name_comparison("first_name"), @@ -41,8 +44,11 @@ This is why `blocking_rules_to_generate_predictions` is a list. Suppose we also ```python settings_example = { "blocking_rules_to_generate_predictions" [ - "l.first_name = r.first_name and l.surname = r.surname", - "l.postcode = r.postcode" + brl.and_( + brl.exact_match_rule("first_name"), + brl.exact_match_rule("surname") + ), + brl.exact_match_rule("postcode") ] } ``` @@ -65,8 +71,8 @@ Once a linker has been instatiated, we can use the `cumulative_num_comparisons_f ```py settings = { "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name", - "l.surname = r.surname", + brl.exact_match_rule("first_name"), + brl.exact_match_rule("surname") ], } ``` diff --git a/mkdocs.yml b/mkdocs.yml index d20150fc8e..b85c977a75 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -129,8 +129,8 @@ nav: - Feature Engineering: "topic_guides/data_preparation/feature_engineering.md" - Blocking: - What are Blocking Rules?: "topic_guides/blocking/blocking_rules.md" + - Prediction Blocking Rules: "topic_guides/blocking/predictions.md" - Model Training Blocking Rules: "topic_guides/blocking/model_training.md" - - Prediction Blocking Rules: "topic_guides/blocking/predictions.md" - Computational Performance: "topic_guides/blocking/performance.md" - Comparing Records: - Defining and customising comparisons: "topic_guides/comparisons/customising_comparisons.ipynb" From b4beaf1351c7e982b8e806c03fa284c1f528bb5a Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Mon, 17 Jul 2023 15:26:09 +0100 Subject: [PATCH 23/23] lint --- tests/test_find_new_matches.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/test_find_new_matches.py b/tests/test_find_new_matches.py index 95d4654606..a59537b842 100644 --- a/tests/test_find_new_matches.py +++ b/tests/test_find_new_matches.py @@ -3,7 +3,7 @@ import pandas as pd from splink.duckdb.comparison_library import exact_match -from splink.duckdb.linker import DuckDBLinker + from .basic_settings import get_settings_dict from .decorator import mark_with_dialects_excluding