Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplified the Entities.escape method #2183

Merged
merged 2 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
* In the `TreeBuilder`, the `onNodeInserted()` and `onNodeClosed()` events are now also fired for the outermost /
root `Document` node. This enables source position tracking on the Document node (which was previously unset). And
it also enables the node traversor to see the outer Document node. [2182](https://github.com/jhy/jsoup/pull/2182)
* Speed optimized `html()` and `Entities.escape()` by around 22% when the input contains Unicode characters in a
supplementary plane. [2183](https://github.com/jhy/jsoup/pull/2183)

### Bug Fixes

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/Attribute.java
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D
accum.append(key);
if (!shouldCollapseAttribute(key, val, out)) {
accum.append("=\"");
Entities.escape(accum, Attributes.checkNotNull(val) , out, false, true, false, false, false);
Entities.escape(accum, Attributes.checkNotNull(val), out, Entities.ForAttribute); // preserves whitespace
accum.append('"');
}
}
Expand Down
157 changes: 82 additions & 75 deletions src/main/java/org/jsoup/nodes/Entities.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
* HTML named character references</a>.
*/
public class Entities {
// constants for escape options:
static final int ForText = 0x1;
static final int ForAttribute = 0x2;
static final int Normalise = 0x4;
static final int TrimLeading = 0x8;
static final int TrimTrailing = 0x10;

private static final int empty = -1;
private static final String emptyName = "";
static final int codepointRadix = 36;
Expand Down Expand Up @@ -69,10 +76,6 @@ String nameForCodepoint(final int codepoint) {
}
return emptyName;
}

private int size() {
return nameKeys.length;
}
}

private Entities() {
Expand Down Expand Up @@ -144,7 +147,7 @@ public static String escape(String string, OutputSettings out) {
return "";
StringBuilder accum = StringUtil.borrowBuilder();
try {
escape(accum, string, out, true, true, false, false, false); // for text and for attribute; preserve whitespaces
escape(accum, string, out, ForText | ForAttribute); // for text and for attribute; preserve whitespaces
} catch (IOException e) {
throw new SerializationException(e); // doesn't happen
}
Expand All @@ -166,27 +169,24 @@ public static String escape(String string) {
}
private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings

// this method does a lot, but other breakups cause rescanning and stringbuilder generations
static void escape(Appendable accum, String string, OutputSettings out,
boolean forText, boolean forAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {

boolean lastWasWhite = false;
boolean reachedNonWhite = false;
static void escape(Appendable accum, String string, OutputSettings out, int options) throws IOException {
final EscapeMode escapeMode = out.escapeMode();
final CharsetEncoder encoder = out.encoder();
final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
final int length = string.length();

int codePoint;
boolean lastWasWhite = false;
boolean reachedNonWhite = false;
boolean skipped = false;
for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
codePoint = string.codePointAt(offset);

if (normaliseWhite) {
if ((options & Normalise) != 0) {
if (StringUtil.isWhitespace(codePoint)) {
if (stripLeadingWhite && !reachedNonWhite) continue;
if ((options & TrimLeading) != 0 && !reachedNonWhite) continue;
if (lastWasWhite) continue;
if (trimTrailing) {
if ((options & TrimTrailing) != 0) {
skipped = true;
continue;
}
Expand All @@ -202,71 +202,78 @@ static void escape(Appendable accum, String string, OutputSettings out,
}
}
}
// surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
final char c = (char) codePoint;
// html specific and required escapes:
switch (c) {
case '&':
accum.append("&amp;");
break;
case 0xA0:
if (escapeMode != EscapeMode.xhtml)
accum.append("&nbsp;");
else
accum.append("&#xa0;");
break;
case '<':
// escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
if (forText || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml)
accum.append("&lt;");
else
accum.append(c);
break;
case '>':
if (forText)
accum.append("&gt;");
else
accum.append(c);
break;
case '"':
if (forAttribute)
accum.append("&quot;");
else
accum.append(c);
break;
case '\'':
if (forAttribute && forText) { // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape.
if (escapeMode == EscapeMode.xhtml)
accum.append("&#x27;");
else
accum.append("&apos;");
}
else
accum.append(c);
break;
// we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
case 0x9:
case 0xA:
case 0xD:
accum.append(c);
break;
default:
if (c < 0x20 || !canEncode(coreCharset, c, encoder))
appendEncoded(accum, escapeMode, codePoint);
else
accum.append(c);
}
} else {
final String c = new String(Character.toChars(codePoint));
if (encoder.canEncode(c)) // uses fallback encoder for simplicity
appendEscaped(accum, out, options, codePoint, escapeMode, encoder, coreCharset);
}
}

private static void appendEscaped(Appendable accum, OutputSettings out, int options,
int codePoint, EscapeMode escapeMode, CharsetEncoder encoder, CoreCharset coreCharset) throws IOException {

// surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
final char c = (char) codePoint;
if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
// html specific and required escapes:
switch (c) {
case '&':
accum.append("&amp;");
break;
case 0xA0:
appendNbsp(accum, escapeMode);
break;
case '<':
// escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
appendLt(accum, options, escapeMode, out);
break;
case '>':
if ((options & ForText) != 0) accum.append("&gt;");
else accum.append(c);
break;
case '"':
if ((options & ForAttribute) != 0) accum.append("&quot;");
else accum.append(c);
break;
case '\'':
// special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape.
appendApos(accum, options, escapeMode);
break;
// we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
case 0x9:
case 0xA:
case 0xD:
accum.append(c);
else
appendEncoded(accum, escapeMode, codePoint);
break;
default:
if (c < 0x20 || !canEncode(coreCharset, c, encoder)) appendEncoded(accum, escapeMode, codePoint);
else accum.append(c);
}
} else {
if (canEncode(coreCharset, c, encoder)) {
String s = new String(Character.toChars(codePoint));
accum.append(s);
} else appendEncoded(accum, escapeMode, codePoint);
}
}

/**
 * Writes the escaped form of a non-breaking space (U+00A0): the numeric reference {@code &#xa0;} in xhtml escape
 * mode, and the named entity {@code &nbsp;} in every other mode.
 *
 * @param accum destination for the escaped text
 * @param escapeMode the escape mode in effect
 * @throws IOException if {@code accum} fails to accept the output
 */
private static void appendNbsp(Appendable accum, EscapeMode escapeMode) throws IOException {
    final String escaped = (escapeMode == EscapeMode.xhtml) ? "&#xa0;" : "&nbsp;";
    accum.append(escaped);
}

/**
 * Writes a {@code <} character, escaped as {@code &lt;} when in character data, in xhtml escape mode, or when the
 * output syntax is XML. In an HTML attribute value no escape is needed, so the character is written as-is.
 *
 * @param accum destination for the output
 * @param options bitmask of escape options; only the {@code ForText} bit is examined here
 * @param escapeMode the escape mode in effect
 * @param out output settings, consulted for the syntax only if the earlier checks did not already decide
 * @throws IOException if {@code accum} fails to accept the output
 */
private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, OutputSettings out) throws IOException {
    final boolean inText = (options & ForText) != 0;
    if (inText || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml) {
        accum.append("&lt;");
        return;
    }
    accum.append('<');
}

/**
 * Writes an apostrophe. It is escaped only when both the {@code ForText} and {@code ForAttribute} option bits are
 * set (the maximal-escaping case): as {@code &#x27;} in xhtml escape mode, otherwise as {@code &apos;}. With any
 * other option combination the apostrophe is written unescaped.
 *
 * @param accum destination for the output
 * @param options bitmask of escape options
 * @param escapeMode the escape mode in effect
 * @throws IOException if {@code accum} fails to accept the output
 */
private static void appendApos(Appendable accum, int options, EscapeMode escapeMode) throws IOException {
    final int maximal = ForAttribute | ForText;
    if ((options & maximal) != maximal) {
        // not maximally escaping: the raw character is fine
        accum.append('\'');
        return;
    }
    accum.append(escapeMode == EscapeMode.xhtml ? "&#x27;" : "&apos;");
}


private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
final String name = escapeMode.nameForCodepoint(codePoint);
if (!emptyName.equals(name)) // ok for identity check
Expand Down Expand Up @@ -315,7 +322,7 @@ private static boolean canEncode(final CoreCharset charset, final char c, final
case ascii:
return c < 0x80;
case utf:
return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
return !(c >= Character.MIN_SURROGATE && c < (Character.MAX_SURROGATE + 1)); // !Character.isSurrogate(c); but not in Android 10 desugar
default:
return fallback.canEncode(c);
}
Expand Down
15 changes: 9 additions & 6 deletions src/main/java/org/jsoup/nodes/TextNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,17 @@ public TextNode splitText(int offset) {
@Override
void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
final boolean prettyPrint = out.prettyPrint();
final Element parent = parentNode instanceof Element ? ((Element) parentNode) : null;
final boolean normaliseWhite = prettyPrint && !Element.preserveWhitespace(parentNode);
final boolean trimLikeBlock = parent != null && (parent.tag().isBlock() || parent.tag().formatAsBlock());
boolean trimLeading = false, trimTrailing = false;
int escape = Entities.ForText;

if (normaliseWhite) {
trimLeading = (trimLikeBlock && siblingIndex == 0) || parentNode instanceof Document;
trimTrailing = trimLikeBlock && nextSibling() == null;
escape |= Entities.Normalise;
final Element parent = parentNode instanceof Element ? ((Element) parentNode) : null;
final boolean trimLikeBlock = parent != null && (parent.tag().isBlock() || parent.tag().formatAsBlock());
if ((trimLikeBlock && siblingIndex == 0) || parentNode instanceof Document)
escape |= Entities.TrimLeading;
if (trimLikeBlock && nextSibling() == null)
escape |= Entities.TrimTrailing;

// if this text is just whitespace, and the next node will cause an indent, skip this text:
Node next = nextSibling();
Expand All @@ -110,7 +113,7 @@ void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) thr
indent(accum, depth, out);
}

Entities.escape(accum, coreValue(), out, true, false, normaliseWhite, trimLeading, trimTrailing);
Entities.escape(accum, coreValue(), out, escape);
}

@Override
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/XmlDeclaration.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ private void getWholeDeclaration(Appendable accum, Document.OutputSettings out)
accum.append(key);
if (!val.isEmpty()) {
accum.append("=\"");
Entities.escape(accum, val, out, false, true, false, false, false);
Entities.escape(accum, val, out, Entities.ForAttribute);
accum.append('"');
}
}
Expand Down
Loading