From daef8bb412db6c94b9895837ae6e28c5770c65ad Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Wed, 29 Nov 2023 11:09:09 +1100 Subject: [PATCH] Initialise current token to the virtual start token Ensures that body fragment parsing, which adds the context element to the stack before there is a real token, has a current token when tracking the position of that first stack insert. Fixes #2068 --- CHANGES.md | 2 ++ src/main/java/org/jsoup/parser/TreeBuilder.java | 4 ++-- .../java/org/jsoup/parser/PositionTest.java | 17 +++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 2856d63e73..ec8ff4cf2e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -7,6 +7,8 @@ * When tracking the source position of attributes, if source attribute name was mix-cased but the parser was lower-case normalizing attribute names, the source position for that attribute was not tracked correctly. [2067](https://github.com/jhy/jsoup/issues/2067) +* When tracking the source position of a body fragment parse, a null pointer exception was + thrown. [2068](https://github.com/jhy/jsoup/issues/2068) --- Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 81f1fb57e0..7cd06f5e7d 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -26,7 +26,7 @@ abstract class TreeBuilder { Document doc; // current doc we are building into ArrayList stack; // the stack of open elements String baseUri; // current base uri, for creating new elements - Token currentToken; // currentToken is used only for error tracking. + Token currentToken; // currentToken is used for error and source position tracking. Initialised to the virtual start token in initialiseParse ParseSettings settings; Map seenTags; // tags we've used in this parse; saves tag GC for custom tags. 
@@ -48,11 +48,11 @@ void initialiseParse(Reader input, String baseUri, Parser parser) { reader = new CharacterReader(input); trackSourceRange = parser.isTrackPosition(); reader.trackNewlines(parser.isTrackErrors() || trackSourceRange); // when tracking errors or source ranges, enable newline tracking for better legibility - currentToken = null; tokeniser = new Tokeniser(this); stack = new ArrayList<>(32); seenTags = new HashMap<>(); start = new Token.StartTag(this); + currentToken = start; // init current token to the virtual start token. this.baseUri = baseUri; } diff --git a/src/test/java/org/jsoup/parser/PositionTest.java b/src/test/java/org/jsoup/parser/PositionTest.java index e4662c6546..35f1494b55 100644 --- a/src/test/java/org/jsoup/parser/PositionTest.java +++ b/src/test/java/org/jsoup/parser/PositionTest.java @@ -470,6 +470,23 @@ private void printRange(Node node) { assertEquals("id:3-5=6-7; ", xmlLcPos .toString()); } + @Test void tracksFrag() { + // https://github.com/jhy/jsoup/issues/2068 + String html = "<h1 id=1>One</h1>\n<h2 id=2>Two</h2><h10>Ten</h10>"; + Document shellDoc = Document.createShell(""); + + List<Node> nodes = TrackingHtmlParser.parseFragmentInput(html, shellDoc.body(), shellDoc.baseUri()); + StringBuilder track = new StringBuilder(); + + // nodes is the top level nodes - want to descend to check all tracked OK + nodes.forEach(node -> node.nodeStream().forEach(descend -> { + accumulatePositions(descend, track); + accumulateAttributePositions(descend, track); + })); + + assertEquals("h1:0-9~12-17; id:4-6=7-8; #text:9-12; #text:17-18; h2:18-27~30-35; id:22-24=25-26; #text:27-30; h10:35-40~43-49; #text:40-43; ", track.toString()); + } + static void accumulateAttributePositions(Node node, StringBuilder sb) { if (node instanceof LeafNode) return; // leafnode pseudo attributes are not tracked for (Attribute attribute : node.attributes()) {