From f49f92c4c29cc2a8440c4ffccc2fb248cdd30694 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 9 Jul 2021 15:38:27 +1000 Subject: [PATCH] More robust prevention of XML Declaration recursion --- src/main/java/org/jsoup/nodes/Comment.java | 16 ++++++++-------- .../org/jsoup/integration/FuzzFixesTest.java | 11 +++++++++++ src/test/resources/fuzztests/1569.html | Bin 0 -> 317288 bytes 3 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 src/test/resources/fuzztests/1569.html diff --git a/src/main/java/org/jsoup/nodes/Comment.java b/src/main/java/org/jsoup/nodes/Comment.java index c3d0e73f21..33f5c1915c 100644 --- a/src/main/java/org/jsoup/nodes/Comment.java +++ b/src/main/java/org/jsoup/nodes/Comment.java @@ -1,11 +1,10 @@ package org.jsoup.nodes; -import org.jsoup.Jsoup; +import org.jsoup.parser.ParseSettings; import org.jsoup.parser.Parser; import javax.annotation.Nullable; import java.io.IOException; -import java.util.regex.Pattern; /** A comment node. @@ -67,9 +66,8 @@ public boolean isXmlDeclaration() { return isXmlDeclarationData(data); } - private static final Pattern xmlDeclPattern = Pattern.compile("^[!?]xml.*", Pattern.CASE_INSENSITIVE); private static boolean isXmlDeclarationData(String data) { - return data.length() > 4 && xmlDeclPattern.matcher(data).matches(); + return (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))); } /** @@ -81,13 +79,15 @@ private static boolean isXmlDeclarationData(String data) { XmlDeclaration decl = null; String declContent = data.substring(1, data.length() - 1); - // make sure this bogus comment is not packed with recursive xml decls; null out if so + // make sure this bogus comment is not immediately followed by another, treat as comment if so if (isXmlDeclarationData(declContent)) return null; - Document doc = Jsoup.parse("<" + declContent + ">", baseUri(), Parser.xmlParser()); - if (doc.children().size() > 0) { - Element el = doc.child(0); + String fragment = "<" + declContent + ">"; + // use the HTML parser not XML, so we don't get into a recursive XML Declaration on contrived data + Document doc = Parser.htmlParser().settings(ParseSettings.preserveCase).parseInput(fragment, baseUri()); + if (doc.body().children().size() > 0) { + Element el = doc.body().child(0); decl = new XmlDeclaration(NodeUtils.parser(doc).settings().normalizeTag(el.tagName()), data.startsWith("!")); decl.attributes().addAll(el.attributes()); } diff --git a/src/test/java/org/jsoup/integration/FuzzFixesTest.java b/src/test/java/org/jsoup/integration/FuzzFixesTest.java index 5aeb7c9f9a..cff3333084 100644 --- a/src/test/java/org/jsoup/integration/FuzzFixesTest.java +++ b/src/test/java/org/jsoup/integration/FuzzFixesTest.java @@ -43,6 +43,17 @@ public void xmlDeclOverflow() throws IOException { assertNotNull(docXml); } + @Test + public void xmlDeclOverflowOOM() throws IOException { + // https://github.com/jhy/jsoup/issues/1569 + File in = ParseTest.getFile("/fuzztests/1569.html"); + Document doc = Jsoup.parse(in, "UTF-8"); + assertNotNull(doc); + + Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser()); + assertNotNull(docXml); + } + @Test public void stackOverflowState14() throws IOException { // https://github.com/jhy/jsoup/issues/1543 diff --git a/src/test/resources/fuzztests/1569.html b/src/test/resources/fuzztests/1569.html new file mode 100644 index 0000000000000000000000000000000000000000..0efe6370ce1acf628cd7bff84ecd0392e8eb99ff GIT binary patch literal 317288 zcmeI*J8vCFwgq4^zdvFiKn$Gpm?1@hFrX1c1!N!t4jc$HK!JgsSjmkX`1csF0_Jw^ zK>ja=W8JFi$EoV7?(-mpxduK49&vE^IK}SOd#}CL+9xL`-`srnpPTcG|NY_ni|+J3 z`?%>&yNj;7?QXi`?%&<`lAG>R_`h$@@6IoteEB5)GrS=D=lRmB(s%x{y9wVf{;>CF z=XZZxj6bLSLq2}k{E+?^OJDx4yB}}gJuiN0>vxN<3cq{$fzwx9U0xUO;9KW@8h`pDZHhdh4=h<-fj5#(~%zdZo@m+9M1f$e%pAfPtS%odKzAR)xYcR z_}Bc;9RK>a;~)Obs~2m>b$WGuIeeG!tAua4`EB|Or1h=CWfOkz{~R41r7zj%&$I2f zzKPe_qkQ}R?Od&%(L>)zH~cN0?Q?KH|Mltb<%9d}m*4+9{VLtT{dN7t@H0MsSovcg za%}Z=G#uFpY=JGS_sxJdux0I*d?1R!mZ7xC99dF+yPaZx<%cRie+`nrhtbn zHQvG&JyG6KZ|S{LbEj^p?<3e!p9yS%EwCk$c=#4vE-trct$Ri`H76NWjLXf&C9NV1 zm)iv--DePy9cUtZ`$3L5(E=2k6Fp?$r6;QEe?3usCVC=0k)B9TMBg%0{44WSGw9SH zta!YE7E;qu%K5QEXz~$t!`1b9P<+mhDct;@VK1Y{ko#6pC;o8NsE`y8vxh>TVTs* zwuUVQSWI5?tUQG+Ywqns480kXk8i=ZtUSqOEKKaax<%chZc(=+>0OVDdZOicovg5E zrp~BaC=Tw58NKdpH-341Z*-#;!?f?}$Y=JGXrGMf!vH%k^1Y2Os_5@Ga zP_m(9L&Zp4xpR-54xAq;zNP2fS}LvfAP3)4Of-grlb3ssZ<)b0Y=JFB-uz;Sxt>T* zq$f(x)g{_M&j0n5{FXYaM1D(sCI>LrdLn!azGYUx!xq@muXB<+CwFdc@06Fj5YpIE z*WaGYv|KnY*aBN%%VtsFs(=Ev$cD<3N#xGO61_FO%TKA#FAUl&nU*pwhs!{wWzwm| z6|#x=$Yj663BeZF67%@ww_F#J2VTX96~&RX9>8t+3~b3t_|~y4!ZE&Ofk4O(G}8;> zTNaKB--2(!x8Pe6K+3d~X*tBKr)TG|{_W!2%pTQun{4G(EUI^2#f$slBV380|4$RZ zDTDE%liw1~Wqy0k4{5Lp$l`8Ul@1^d+R$ZAl#MmaiSRA>7TEIppQm39`p1Zxc8J&^ z@7~6TueP(l^1ajxUah<+sRh*^Z<#7Gx|0 z(($Hy7UaWomTi#)hb^!LwxpVU>G5k$BpYg!p5zZ%GiWntGw2AN`$<{n6DhfK^?d|e z>NAl$ml~X%OM7*Tx<%d6$%cA;$*cGonmct%{ft(()Muh@QMZI9X%d3AQSMQXnN+Z4BzE^0rQ%!gE%+9E%UgU) zTLo-R)JNnNnV@p#7Rbr0W6pY>Se(R(qn>Cuh|R^s2fcfo621lBQq^j(WdVw1TJ96o z23ueYY`N_QQt8dB7waLA>K1iNk>GQsCt7$A>xl+`K5R*zYdb`u+@dv5m-FB4xkUoD zz?S^3fi1AbexfH8jLPDc#Vs3ZBsk=^$ZwI~BERKxOl4sUY=JGX1-5jRX#dL;O1W#w zUiIZNEt{<2e01B3YuaiydAag(+kKo4$p^{$>sPoyUb6xq|W`gzlwxTsrTi@IgXLa?*-x>&VGmrIvRmrIu`({iV>GSw0* zEYS^p3w;ZH%T#V{A~9*_Xy={`SwZ$Hh7SHQ)2V6aXy<6>l5z~Tz!q8D#gxGAvxx-X z4wCj0)w_7?C#uiHej@vc>?ew2d)hhLxk<`l23_b%i%;lVWJAe@k_{Dk3UeZJB6A{h zqNAhKMirWzEwX7d=r46; z`j*pi&vzkdDL$cZp>LsYp>LsYdHdQ7`u^Sw+6?+on!=pOoXDK$e)wo3sr^K>bD=A{ z>F8U!*mtC`5nL`VcTC;2LquL~F)8F##N|dxf2F&aq5@vLT%o@0C+dEJEwE*$&_w%* z>?g|8%2)AM&VOk3H_xcy6B!F;(CQX5=;1+fha_MsEa_MrT zpscMXld%xaWuFePczIy#^+eyjKkK@{Egn7@E{I8Wi@HVKqHa;Qs9V%63$k8&7422D zS8+Un@Tjm?akF%Fzmz?GB7P!%q7Y7*-w6Cf3xx-4nUh6}t4AM9h?o-<CNSChdO*s@&~0$c1p zv-^xLcTu{|Pjr1nJ6G$(Xy@uPp`D|h%cqwuNyxmpgDv%a1Y7Defi18Fw$SC$phq~lz8q9Cuw`#jn!csJk6=rECiE@zE%Yt)E%Yt)E%YsE zLDnGgp>N4H#kO=PcR${~3u2%Y>Ja~(Hc9+mQoo!9I>3v(u& zSA8ZrFP)dpOXsEY(s}8;bY8KKnvYM{GO+(aomW2KqyDvZ+SX}3(R@P+70o#I-a}|c zMMFhHMbmLazFt=P<8pDig^udV_Qr+5M{CLjTVRU>syRt1$w87si3?B9>T9BT-$=Jh zsS&FQfV~Y|-J)*Ui7ZsN4537@1-4Afms`mWMctC!1p{jSW>bkxB{O=Q4|*p$%!akjrV^VlJj3YEmMADQ9;-YE7Ni`0LZkQJf*uRXI&}Ps(HS}iCX3+US)ePDUI%?eH<(A>F@^a{X~s5w%L;<8*$6D%pS9Jxq2c!k=(i8|2+L_l-q`2 zVm(nRDK5H?AG+QgI1Y7BL)>0>N|#HQJIH6g_y+oAq_Sp`-$u)%9sKBpf5&GBy5>zqRpCs{ik9D6zPf7 zE%ICBxBOsEbaZsY2h9gPyAg_s&0}uW&=VDfh@L3idZ{feMCFfcj4^{gsNAFZw`;y! z?f3Moz9yPVk$;bRlNT>n)*_r<$ykuFAY);?SuWRZmW&2m3Ec@=jm zE3?9)$w1TP@RH1j$5kZkjG*_k^MyW6ZKw&rCa2u!zobumSxc) zT`pZNT`p|7>ISXRv%nh9^Uu^R;o0<4{ba&0)Z;Zixf# zOrsj2Y&VRPx+O|>4@6SBT)JGkT-cHx!jFukum!fj7T5w?x&Vczu}X_^RU2P2&Hv$~ zw4O*$bUxlvkIP)EThuM;7IlldMcp#0$7cncx`kJ9C*h}z1^b}$gd1K(UPWF-Ud32j z%K4Y`-}8d0(>U&ij59k#!o9KIhT7ugvbbRj%|FfmTr$L~xKqnYmrIv>pqh;4zbHgx zagPJK&CJ`b`M*B>&8rxn&0nT6|Kyw42cO^lVLwsU0h&RZL7PFFL7PFFL7PFpN#37c zL77j}in8+LB$g8Yxj7$%apStf_`hwMHGjzR6V-acM40SXrR7J8Z^5_7hT>J6##$9P zz?Pk3NMu9tL1%?v)Q}|4uewFu@>608>^|!r0FwMf^?d|e>NC+3C1O_HGA*Rs9&!a+ z>iY<`)Mo-)U`ud_Ph-G0uV710BI0rf+B>^CvOKI&X2Az7V<8)r^kEacioA+D4I*F* zuVOx3WI0Xt@colgS$MM^C8$~~^li#w14wJDUV?oBk4zBOYGH?o8 zV9TI)u(LJLXLh#QeKr~Z_E>BD0I9s5NKd3E(i1H{eqjr*BClfbXVT^7$sL&-h*yzU zaY_OVtYV04zLMXvaI<7nJ(~Zd$b9i~b!ED}iiZ!pcol1395O*9p*)8ki96sKU#Gvb3DC>GcPTVTr$ zsVc>d!mG%u$g9Y!$g9Y!7*!u_6+EwEIF~scn%0+0p0sItb@$`#yI@w!UOw-Vzim9E z^+fAqulYb#-J)($x2RjxE$S9^%fbkopQxE|R%rF6!lKQt#{HdvczuYX4c`(aaIFA` z%Ul#z&jbcymv$~crtIQ1gEoWC?;1Uko=8umC(;w?iS$HzqK7vvdZJd_B6rT5NbcO> zHk_Uifi1A*CQv+avHklU(vr-ei=pzc45+w<%%F#DIIyL-E(Te?x<%chZc(>f@j=(* z03URHCVbF#wl;G2i?MAsk4#A?u%*6_U`u@_um!fj7THiMVMp{W^eyx)v~xoYK*?_Q z+SZfkws{`wt6h#Mt5snOY=JGi+%)+q6;fupT)NzK^}PI+M2XPl#`a$}RB{HZTPDji zAM{Sfn5=3|a8f*kVaw;mmegDHT8qV%5I_I*>2F(8Y)y%p>-Z6LIx(%SkYG&B>t>-7 zcV@FDXu};vPjNX>^j!4!+S1)dEl4eRxnraZY=JG5$V*6SOKlp|w{DSDAq#MQo|HrPh{OTWa6FUhYwBl04iB#^q2e z`neqHGvRXJa^P|Z&7BR{OAlg7WJ=^*V@uKVc@;hBk(?8c>K1iNT(1}g=2vr2P2dk!3+S0vE;*?o94X?#yV! zh|0aV!D(pmrb8JDe0&SO#nzNMKttW4Zc(?4A{u_8qREjyzEW856Sd3|(#Mw^Hn0V@ z@GACx`CoH7G3(xBEZm*x@UR88z!uojvtay0p>a7s3`ujzP_H5CqG^0H_|X&T ziM~%cKkAl7+>yFPHq=nU1}M#L5ne@J#gS&sAF_10bh+cumM)hrmoB$Q$yShZrf{Ik zrOVC5F7z$*E%Yt)Es+6?o7R&S?(k}&5YoXG*aBN%3v7A&n!bg;CFXtR6qY&BPAzM; z)`WJBc1{-e5|Y9edAZ5PBrms%eMi!Am=l>3nG=mJ$HSQ&nU?i^1Y7Dek!dN@Ql_PB zsIO&1^$}q!ksdZ+CvL}_$ehTW$ehTW$ed_d1<$LBi|NT7HR{=?jS8r+P&56v3%!zinY4TH|Z%KP{ zV9Un*LpwKYmn-g#;uAAyGiWpDx-?e*g3ItFhGyTD>m@UgYD4o}+%WW5c(YLBGDtsU=+Q%N2D~23>lhOpeh) zeAoh8V9S=9q)$qav9RYFZP)@^U<+&sW&}Oaht2ZF-ru&jGuWKSoG9kV9iG@kJGaw# zP2HkyQMagD)GZgGa8tLO?xW@jjN|6zb1?^f%T5nqd`oB!zI%U$Z;4WXw5Z!0MKRjt$>}Z^5@@HEv6&0b5{8k#?6| mFM1+9QK;YgY@Y*&1K0vvU`t^YNJ&_FqFEF#Z27Ni{{Ig!=W~Ss literal 0 HcmV?d00001