diff --git a/CHANGES.md b/CHANGES.md index c2280ef6c6..39b789d9f8 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -21,7 +21,16 @@ * `Element.cssSelector()` would fail if the element's class contained a `*` character. [2169](https://github.com/jhy/jsoup/issues/2169) * When tracking source ranges, a text node following an invalid self-closing element may be left - untracked.[2175](https://github.com/jhy/jsoup/issues/2175) + untracked. [2175](https://github.com/jhy/jsoup/issues/2175) +* When a document has no doctype, or a doctype not named `html`, it should be parsed in Quirks + Mode. [2197](https://github.com/jhy/jsoup/issues/2197) +* With a selector like `div:has(span + a)`, the `has()` component was not working correctly, as the inner combining + query caused the evaluator to match those against the outer's siblings, not + children. [2187](https://github.com/jhy/jsoup/issues/2187) +* A selector query that included multiple `:has()` components in a nested `:has()` might incorrectly + execute. [2131](https://github.com/jhy/jsoup/issues/2131) +* Updated the simple view of cookies available via `Connection.Response#cookies()` to reflect the contents of the + current cookie jar for the current URL. [1831](https://github.com/jhy/jsoup/issues/1831) ## 1.18.1 (2024-Jul-10) diff --git a/pom.xml b/pom.xml index 02b4207e16..201f441bd3 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ UTF-8 - 9.4.55.v20240627 + 9.4.56.v20240826 @@ -118,7 +118,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.8.0 + 3.10.0 none 8 @@ -203,7 +203,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.3.1 + 3.5.0 -Xss640k @@ -211,7 +211,7 @@ maven-failsafe-plugin - 3.3.1 + 3.5.0 @@ -229,7 +229,7 @@ com.github.siom79.japicmp japicmp-maven-plugin - 0.22.0 + 0.23.0 @@ -372,7 +372,7 @@ maven-failsafe-plugin - 3.3.1 + 3.5.0 @@ -393,7 +393,7 @@ org.junit.jupiter junit-jupiter - 5.10.3 + 5.11.0 test diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index 141fe5b6da..55a5dbacc8 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -604,10 +604,6 @@ interface Base> { /** * Get a cookie value by name from this request/response. - *

- * Response objects have a simplified cookie model. Each cookie set in the response is added to the response - * object's cookie key=value map. The cookie's path, domain, and expiry date are ignored. - *

* @param name name of cookie to retrieve. * @return value of cookie, or null if not set */ @@ -638,6 +634,7 @@ interface Base> { /** * Retrieve all of the request/response cookies as a map * @return cookies + * @see #cookieStore() */ Map cookies(); } diff --git a/src/main/java/org/jsoup/helper/CookieUtil.java b/src/main/java/org/jsoup/helper/CookieUtil.java index f375003753..218e935efd 100644 --- a/src/main/java/org/jsoup/helper/CookieUtil.java +++ b/src/main/java/org/jsoup/helper/CookieUtil.java @@ -4,6 +4,8 @@ import org.jsoup.internal.StringUtil; import java.io.IOException; +import java.net.CookieManager; +import java.net.HttpCookie; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URI; @@ -83,8 +85,21 @@ static URI asUri(URL url) throws IOException { } } - static void storeCookies(HttpConnection.Request req, URL url, Map> resHeaders) throws IOException { - req.cookieManager().put(CookieUtil.asUri(url), resHeaders); // stores cookies for session + /** Store the Result cookies into the cookie manager, and place relevant cookies into the Response object. */ + static void storeCookies(HttpConnection.Request req, HttpConnection.Response res, URL url, Map> resHeaders) throws IOException { + CookieManager manager = req.cookieManager(); + URI uri = CookieUtil.asUri(url); + manager.put(uri, resHeaders); // stores cookies for session + // set up the simple cookie(name, value) map: + Map> cookieMap = manager.get(uri, resHeaders); // get cookies for url; may have been set on this or earlier requests. the headers here are ignored other than a null check + for (List values : cookieMap.values()) { + for (String headerVal : values) { + List cookies = HttpCookie.parse(headerVal); + for (HttpCookie cookie : cookies) { + res.cookie(cookie.getName(), cookie.getValue()); + } + } + } } } diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index 425d77d0cf..712019108b 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -1133,14 +1133,9 @@ private Response(HttpURLConnection conn, HttpConnection.Request request, HttpCon Map> resHeaders = createHeaderMap(conn); processResponseHeaders(resHeaders); // includes cookie key/val read during header scan - CookieUtil.storeCookies(req, url, resHeaders); // add set cookies to cookie store + CookieUtil.storeCookies(req, this, url, resHeaders); // add set cookies to cookie store if (previousResponse != null) { // was redirected - // map previous response cookies into this response cookies() object - for (Map.Entry prevCookie : previousResponse.cookies().entrySet()) { - if (!hasCookie(prevCookie.getKey())) - cookie(prevCookie.getKey(), prevCookie.getValue()); - } previousResponse.safeClose(); // enforce too many redirects: @@ -1176,19 +1171,6 @@ void processResponseHeaders(Map> resHeaders) { continue; // http/1.1 line List values = entry.getValue(); - if (name.equalsIgnoreCase("Set-Cookie")) { - for (String value : values) { - if (value == null) - continue; - TokenQueue cd = new TokenQueue(value); - String cookieName = cd.chompTo("=").trim(); - String cookieVal = cd.consumeTo(";").trim(); - // ignores path, date, domain, validateTLSCertificates et al. full details will be available in cookiestore if required - // name not blank, value not null - if (cookieName.length() > 0 && !cookies.containsKey(cookieName)) // if duplicates, only keep the first - cookie(cookieName, cookieVal); - } - } for (String value : values) { addHeader(name, fixHeaderEncoding(value)); } diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index f1b2d7b239..470a785a50 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -25,18 +25,19 @@ enum HtmlTreeBuilderState { tb.insertCommentNode(t.asComment()); } else if (t.isDoctype()) { // todo: parse error check on expected doctypes - // todo: quirk state check on doctype ids Token.Doctype d = t.asDoctype(); DocumentType doctype = new DocumentType( tb.settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier()); doctype.setPubSysKey(d.getPubSysKey()); tb.getDocument().appendChild(doctype); tb.onNodeInserted(doctype); - if (d.isForceQuirks()) + // todo: quirk state check on more doctype ids, if deemed useful (most are ancient legacy and presumably irrelevant) + if (d.isForceQuirks() || !doctype.name().equals("html") || doctype.publicId().equalsIgnoreCase("HTML")) tb.getDocument().quirksMode(Document.QuirksMode.quirks); tb.transition(BeforeHtml); } else { // todo: check not iframe srcdoc + tb.getDocument().quirksMode(Document.QuirksMode.quirks); // missing doctype tb.transition(BeforeHtml); return tb.process(t); // re-process token } diff --git a/src/main/java/org/jsoup/select/StructuralEvaluator.java b/src/main/java/org/jsoup/select/StructuralEvaluator.java index 1e01ed03f1..c64eab3ac0 100644 --- a/src/main/java/org/jsoup/select/StructuralEvaluator.java +++ b/src/main/java/org/jsoup/select/StructuralEvaluator.java @@ -1,6 +1,7 @@ package org.jsoup.select; import org.jsoup.internal.Functions; +import org.jsoup.internal.SoftPool; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Element; import org.jsoup.nodes.NodeIterator; @@ -51,8 +52,8 @@ public boolean matches(Element root, Element element) { } static class Has extends StructuralEvaluator { - static final ThreadLocal> ThreadElementIter = - ThreadLocal.withInitial(() -> new NodeIterator<>(new Element("html"), Element.class)); + static final SoftPool> ElementIterPool = + new SoftPool<>(() -> new NodeIterator<>(new Element("html"), Element.class)); // the element here is just a placeholder so this can be final - gets set in restart() private final boolean checkSiblings; // evaluating against siblings (or children) @@ -69,16 +70,20 @@ public Has(Evaluator evaluator) { return true; } } - } else { - // otherwise we only want to match children (or below), and not the input element. And we want to minimize GCs so reusing the Iterator obj - NodeIterator it = ThreadElementIter.get(); - it.restart(element); + } + // otherwise we only want to match children (or below), and not the input element. And we want to minimize GCs so reusing the Iterator obj + NodeIterator it = ElementIterPool.borrow(); + it.restart(element); + try { while (it.hasNext()) { Element el = it.next(); if (el == element) continue; // don't match self, only descendants - if (evaluator.matches(element, el)) + if (evaluator.matches(element, el)) { return true; + } } + } finally { + ElementIterPool.release(it); } return false; } diff --git a/src/test/java/org/jsoup/helper/HttpConnectionTest.java b/src/test/java/org/jsoup/helper/HttpConnectionTest.java index 833fead486..7162e7f05b 100644 --- a/src/test/java/org/jsoup/helper/HttpConnectionTest.java +++ b/src/test/java/org/jsoup/helper/HttpConnectionTest.java @@ -155,27 +155,6 @@ public void caseInsensitiveHeaders(Locale locale) { assertEquals(0, res.cookies().size()); } - @Test public void ignoresEmptyCookieNameAndVals() { - // prep http response header map - Map> headers = new HashMap<>(); - List cookieStrings = new ArrayList<>(); - cookieStrings.add(null); - cookieStrings.add(""); - cookieStrings.add("one"); - cookieStrings.add("two="); - cookieStrings.add("three=;"); - cookieStrings.add("four=data; Domain=.example.com; Path=/"); - - headers.put("Set-Cookie", cookieStrings); - HttpConnection.Response res = new HttpConnection.Response(); - res.processResponseHeaders(headers); - assertEquals(4, res.cookies().size()); - assertEquals("", res.cookie("one")); - assertEquals("", res.cookie("two")); - assertEquals("", res.cookie("three")); - assertEquals("data", res.cookie("four")); - } - @Test public void connectWithUrl() throws MalformedURLException { Connection con = HttpConnection.connect(new URL("http://example.com")); assertEquals("http://example.com", con.request().url().toExternalForm()); diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java index 7e1de61547..f9c39a4d54 100644 --- a/src/test/java/org/jsoup/helper/W3CDomTest.java +++ b/src/test/java/org/jsoup/helper/W3CDomTest.java @@ -387,4 +387,33 @@ public void canOutputHtmlWithoutNamespace() { assertEquals("Foo", doc.getFirstChild().getTextContent()); } + @Test void testHtmlParseAttributesAreCaseInsensitive() throws IOException { + // https://github.com/jhy/jsoup/issues/981 + String html = "\n" + + "\n" + + "\"Alt\n" + + "\"Alt\n" + + "\n" + + ""; + org.jsoup.nodes.Document jsoupDoc; + jsoupDoc = Jsoup.parse(html); + org.jsoup.helper.W3CDom jDom = new org.jsoup.helper.W3CDom(); + Document doc = jDom.fromJsoup(jsoupDoc); + org.w3c.dom.Element body = (org.w3c.dom.Element) doc.getDocumentElement().getElementsByTagName("body").item(0); + NodeList imgs = body.getElementsByTagName("img"); + assertEquals(2, imgs.getLength()); + org.w3c.dom.Element first = (org.w3c.dom.Element) imgs.item(0); + assertEquals(first.getAttributes().getLength(), 2); + String img1 = first.getAttribute("src"); + assertEquals("firstImage.jpg", img1); + String alt1 = first.getAttribute("alt"); + assertEquals("Alt one", alt1); + org.w3c.dom.Element second = (org.w3c.dom.Element) imgs.item(1); + assertEquals(second.getAttributes().getLength(), 2); + String img2 = second.getAttribute("src"); + assertEquals("secondImage.jpg", img2); + String alt2 = second.getAttribute("alt"); + assertEquals("Alt two", alt2); + } + } diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java index 3012828085..6042f81c64 100644 --- a/src/test/java/org/jsoup/integration/ConnectTest.java +++ b/src/test/java/org/jsoup/integration/ConnectTest.java @@ -19,6 +19,7 @@ import org.jsoup.parser.StreamParser; import org.jsoup.parser.XmlTreeBuilder; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; @@ -60,6 +61,12 @@ public static void setUp() { echoUrl = EchoServlet.Url; } + @BeforeEach + public void emptyCookieJar() { + // empty the cookie jar, so cookie tests are independent. + Jsoup.connect("http://example.com").cookieStore().removeAll(); + } + @Test public void canConnectToLocalServer() throws IOException { String url = HelloServlet.Url; @@ -427,7 +434,7 @@ public void multiCookieSet() throws IOException { // test cookies set by redirect: Map cookies = res.cookies(); assertEquals("asdfg123", cookies.get("token")); - assertEquals("jhy", cookies.get("uid")); + assertEquals("jhy", cookies.get("uid")); // two uids set, order dependent // send those cookies into the echo URL by map: Document doc = Jsoup.connect(echoUrl).cookies(cookies).get(); diff --git a/src/test/java/org/jsoup/integration/servlets/RedirectServlet.java b/src/test/java/org/jsoup/integration/servlets/RedirectServlet.java index 0a937b772f..41243c36aa 100644 --- a/src/test/java/org/jsoup/integration/servlets/RedirectServlet.java +++ b/src/test/java/org/jsoup/integration/servlets/RedirectServlet.java @@ -33,7 +33,8 @@ protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOEx if (req.getParameter(SetCookiesParam) != null) { res.addCookie(new Cookie("token", "asdfg123")); - res.addCookie(new Cookie("uid", "jhy")); + res.addCookie(new Cookie("uid", "foobar")); + res.addCookie(new Cookie("uid", "jhy")); // dupe, should use latter } res.setHeader("Location", location); diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java index be566d3654..2c2fd50d9b 100644 --- a/src/test/java/org/jsoup/nodes/ElementTest.java +++ b/src/test/java/org/jsoup/nodes/ElementTest.java @@ -26,6 +26,7 @@ import java.util.regex.Pattern; import java.util.stream.Stream; +import static org.jsoup.select.SelectorTest.assertSelectedOwnText; import static org.junit.jupiter.api.Assertions.*; /** @@ -2636,7 +2637,7 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) { assertEquals(selected.first(), div); } - @Test void cssSelectorWithAstrix() { + @Test void cssSelectorWithAsterisk() { // https://github.com/jhy/jsoup/issues/2169 Document doc = Jsoup.parse("
One
Two
"); Element div = doc.expectFirst("div"); @@ -2648,6 +2649,16 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) { assertEquals(selected.first(), div); } + @Test void cssSelectorWithPipe() { + // https://github.com/jhy/jsoup/issues/1998 + Document doc = Jsoup.parse("
One
"); + Element span = doc.expectFirst("div span"); + String selector = span.cssSelector(); + assertEquals("html > body > div > span.\\|", selector); + Elements selected = doc.select(selector); + assertSelectedOwnText(selected, "One"); + } + @Test void orphanSiblings() { Element el = new Element("div"); assertEquals(0, el.siblingElements().size()); diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 7fa7a67a59..a67003a839 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1888,4 +1888,24 @@ private static void assertMathNamespace(Element el) { img.ownerDocument().outputSettings().charset("ascii"); assertEquals("", img.outerHtml()); } + + @Test void tableInPInQuirksMode() { + // https://github.com/jhy/jsoup/issues/2197 + String html = "

Hello table data

"; + Document doc = Jsoup.parse(html); + assertEquals(Document.QuirksMode.quirks, doc.quirksMode()); + assertEquals( + "

Hello table data

", // quirks, allows table in p + TextUtil.normalizeSpaces(doc.body().html()) + ); + + // doctype set, no quirks + html ="

Hello table data

"; + doc = Jsoup.parse(html); + assertEquals(Document.QuirksMode.noQuirks, doc.quirksMode()); + assertEquals( + "

Hello table data

", // no quirks, p gets closed + TextUtil.normalizeSpaces(doc.body().html()) + ); + } } diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java index d296e1a2b7..0ae4048e3f 100644 --- a/src/test/java/org/jsoup/select/SelectorTest.java +++ b/src/test/java/org/jsoup/select/SelectorTest.java @@ -28,7 +28,7 @@ public static void assertSelectedIds(Elements els, String... ids) { } } - static void assertSelectedOwnText(Elements els, String... ownTexts) { + public static void assertSelectedOwnText(Elements els, String... ownTexts) { assertNotNull(els); assertEquals(ownTexts.length, els.size(), "Incorrect number of selected elements"); for (int i = 0; i < ownTexts.length; i++) { @@ -1296,4 +1296,51 @@ public void emptyPseudo() { Elements emptyAttr = doc.select("p:not([*])"); assertSelectedOwnText(emptyAttr, "Three"); } + + @Test void divHasSpanPreceding() { + // https://github.com/jhy/jsoup/issues/2187 + String html = "
abcdef
"; + String q = "div:has(span + a)"; + + Document doc = Jsoup.parse(html); + Elements els = doc.select(q); + assertEquals(1, els.size()); + assertEquals("div", els.first().normalName()); + } + + @Test void divHasDivPreceding() { + // https://github.com/jhy/jsoup/issues/2131 + String html = "
\n" + + "
hello
\n" + + "
there
\n" + + "\n" + + "
"; + + String q = "div:has(>div + div)"; + + Document doc = Jsoup.parse(html); + Elements els = doc.select(q); + assertEquals(1, els.size()); + assertEquals("div", els.first().normalName()); + assertEquals("1", els.first().id()); + } + + @Test void nestedMultiHas() { + // https://github.com/jhy/jsoup/issues/2131 + String html = + "" + + "" + + "" + + "
" + + "
hello
" + + "
world
" + + "
" + + ""; + Document document = Jsoup.parse(html); + + String q = "div:has(> div:has(> span) + div:has(> span))"; + Elements els = document.select(q); + assertEquals(1, els.size()); + assertEquals("o", els.get(0).id()); + } }