Skip to content

Commit

Permalink
Clone Token.Characters into PendingTableCharacters
Browse files Browse the repository at this point in the history
Keeps the source start/end tracking.

Fixes #1927
  • Loading branch information
jhy committed Mar 29, 2023
1 parent dea4969 commit c93ea51
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 18 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ Release 1.16.1 [PENDING]
again, causing errors when fetched.
<https://github.com/jhy/jsoup/issues/1902>

* Bugfix: when tracking input source positions, text inside a table that was foster-parented (moved out of the table) was given invalid source positions.
<https://github.com/jhy/jsoup/issues/1927>

* Bugfix: If the Document.OutputSettings class was initialized, and then Entities.escape(String) called, an NPE may be
thrown due to a class loading circular dependency.
<https://github.com/jhy/jsoup/issues/1910>
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public class HtmlTreeBuilder extends TreeBuilder {
private @Nullable Element contextElement; // fragment parse context -- could be null even if fragment parsing
private ArrayList<Element> formattingElements; // active (open) formatting elements
private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes
private List<String> pendingTableCharacters; // chars in table to be shifted out
private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out
private Token.EndTag emptyEnd; // reused empty end tag

private boolean framesetOk; // if ok to go into frameset
Expand Down Expand Up @@ -676,14 +676,20 @@ void setFormElement(FormElement formElement) {
this.formElement = formElement;
}

void newPendingTableCharacters() {
void resetPendingTableCharacters() {
pendingTableCharacters = new ArrayList<>();
}

List<String> getPendingTableCharacters() {
List<Token.Character> getPendingTableCharacters() {
return pendingTableCharacters;
}

/**
 Queues a character token on the pending table characters list.
 @param c the character token to queue; a copy of it is stored, not the instance passed in
 */
void addPendingTableCharacters(Token.Character c) {
    // store a clone rather than the live token: tokens are otherwise reset, which would lose this one's state
    pendingTableCharacters.add(c.clone());
}

/**
13.2.6.3 Closing elements that have implied end tags
When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements.
Expand Down
23 changes: 9 additions & 14 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import java.util.ArrayList;

Expand Down Expand Up @@ -995,7 +994,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
InTable {
boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isCharacter() && inSorted(tb.currentElement().normalName(), InTableFoster)) {
tb.newPendingTableCharacters();
tb.resetPendingTableCharacters();
tb.markInsertionMode();
tb.transition(InTableText);
return tb.process(t);
Expand Down Expand Up @@ -1106,25 +1105,25 @@ boolean process(Token t, HtmlTreeBuilder tb) {
tb.error(this);
return false;
} else {
tb.getPendingTableCharacters().add(c.getData());
tb.addPendingTableCharacters(c);
}
} else {// todo - don't really like the way these table character data lists are built
} else {
if (tb.getPendingTableCharacters().size() > 0) {
for (String character : tb.getPendingTableCharacters()) {
if (!isWhitespace(character)) {
for (Token.Character c : tb.getPendingTableCharacters()) {
if (!isWhitespace(c)) {
// InTable anything else section:
tb.error(this);
if (inSorted(tb.currentElement().normalName(), InTableFoster)) {
tb.setFosterInserts(true);
tb.process(new Token.Character().data(character), InBody);
tb.process(c, InBody);
tb.setFosterInserts(false);
} else {
tb.process(new Token.Character().data(character), InBody);
tb.process(c, InBody);
}
} else
tb.insert(new Token.Character().data(character));
tb.insert(c);
}
tb.newPendingTableCharacters();
tb.resetPendingTableCharacters();
}
tb.transition(tb.originalState());
return tb.process(t);
Expand Down Expand Up @@ -1759,10 +1758,6 @@ private static boolean isWhitespace(Token t) {
return false;
}

private static boolean isWhitespace(String data) {
return StringUtil.isBlank(data);
}

private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) {
tb.tokeniser.transition(TokeniserState.Rcdata);
tb.markInsertionMode();
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/org/jsoup/parser/Token.java
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ public String toString() {
}
}

static class Character extends Token {
static class Character extends Token implements Cloneable {
private String data;

Character() {
Expand Down Expand Up @@ -410,6 +410,14 @@ String getData() {
public String toString() {
return getData();
}

/**
 Creates a copy of this character token by delegating to {@code super.clone()}.
 @return the copy, typed as a {@code Token.Character}
 @throws RuntimeException wrapping a {@link CloneNotSupportedException} if the superclass refuses to clone
 */
@Override protected Token.Character clone() {
    final Object copy;
    try {
        copy = super.clone();
    } catch (CloneNotSupportedException e) {
        // keep the original exception as the cause so the failure stays diagnosable
        throw new RuntimeException(e);
    }
    return (Token.Character) copy;
}
}

final static class CData extends Character {
Expand Down
22 changes: 22 additions & 0 deletions src/test/java/org/jsoup/nodes/PositionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
import org.jsoup.Jsoup;
import org.jsoup.integration.servlets.FileServlet;
import org.jsoup.parser.Parser;
import org.jsoup.select.NodeTraversor;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

Expand Down Expand Up @@ -169,4 +172,23 @@ class PositionTest {
assertEquals("17,5:779-17,12:786", item.endSourceRange().toString());
}

/**
 Checks that every text node parsed from a table (including text placed directly inside table and tr) reports the
 source range of its original location in the input.
 */
@Test void tracksTableMovedText() {
    String html = "<table>foo<tr>bar<td>baz</td>qux</tr>coo</table>";
    Document doc = Jsoup.parse(html, TrackingParser);

    // gather all text nodes, in traversal order
    List<TextNode> textNodes = new ArrayList<>();
    NodeTraversor.traverse((Node node, int depth) -> {
        if (!(node instanceof TextNode)) return;
        textNodes.add((TextNode) node);
    }, doc);

    // the offsets below match the positions of foo, bar, baz, qux and coo in the input string, in that order
    String[] expectedRanges = {
        "1,8:7-1,11:10",
        "1,15:14-1,18:17",
        "1,22:21-1,25:24",
        "1,30:29-1,33:32",
        "1,38:37-1,41:40"
    };

    assertEquals(expectedRanges.length, textNodes.size());
    for (int i = 0; i < expectedRanges.length; i++) {
        assertEquals(expectedRanges[i], textNodes.get(i).sourceRange().toString());
    }
}

}

0 comments on commit c93ea51

Please sign in to comment.