From f231676ab4fa48234da8eab8a58caaa4eaa42e22 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Tue, 30 Jul 2024 12:47:39 +1000 Subject: [PATCH 1/2] Simplified the Entities.escape method Introduced an options bitset instead of all those boolean method options. Reduced cyclomatic complexity of from 29 to 14. Improved throughput of escape around 22% if the content contains characters in supplemental plane, by no longer going through the charset encoder to test can encode, but pushing it into the CoreCharset. That removes a ByteBuffer allocation on each hit. --- CHANGES.md | 2 + src/main/java/org/jsoup/nodes/Attribute.java | 2 +- src/main/java/org/jsoup/nodes/Entities.java | 157 +++++++++--------- src/main/java/org/jsoup/nodes/TextNode.java | 15 +- .../java/org/jsoup/nodes/XmlDeclaration.java | 2 +- 5 files changed, 95 insertions(+), 83 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 9db0887ac1..4cbaa6c520 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,6 +9,8 @@ * In the `TreeBuilder`, the `onNodeInserted()` and `onNodeClosed()` events are now also fired for the outermost / root `Document` node. This enables source position tracking on the Document node (which was previously unset). And it also enables the node traversor to see the outer Document node. [2182](https://github.com/jhy/jsoup/pull/2182) +* Speed optimized `html()` and `Entities.escape()` when the input contains UTF characters in a supplementary plane, by + around 22%. ### Bug Fixes diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java index 6dcc37e944..3451b2338d 100644 --- a/src/main/java/org/jsoup/nodes/Attribute.java +++ b/src/main/java/org/jsoup/nodes/Attribute.java @@ -169,7 +169,7 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D accum.append(key); if (!shouldCollapseAttribute(key, val, out)) { accum.append("=\""); - Entities.escape(accum, Attributes.checkNotNull(val) , out, false, true, false, false, false); + Entities.escape(accum, Attributes.checkNotNull(val), out, Entities.ForAttribute); // preserves whitespace accum.append('"'); } } diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java index a5585780ee..4968067891 100644 --- a/src/main/java/org/jsoup/nodes/Entities.java +++ b/src/main/java/org/jsoup/nodes/Entities.java @@ -22,6 +22,13 @@ * HTML named character references. */ public class Entities { + // constants for escape options: + static final int ForText = 0x1; + static final int ForAttribute = 0x2; + static final int Normalise = 0x4; + static final int TrimLeading = 0x8; + static final int TrimTrailing = 0x10; + private static final int empty = -1; private static final String emptyName = ""; static final int codepointRadix = 36; @@ -69,10 +76,6 @@ String nameForCodepoint(final int codepoint) { } return emptyName; } - - private int size() { - return nameKeys.length; - } } private Entities() { @@ -144,7 +147,7 @@ public static String escape(String string, OutputSettings out) { return ""; StringBuilder accum = StringUtil.borrowBuilder(); try { - escape(accum, string, out, true, true, false, false, false); // for text and for attribute; preserve whitespaces + escape(accum, string, out, ForText | ForAttribute); // for text and for attribute; preserve whitespaces } catch (IOException e) { throw new SerializationException(e); // doesn't happen } @@ -166,27 +169,24 @@ public static String escape(String string) { } private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings - // this method does a lot, but other breakups cause rescanning and stringbuilder generations - static void escape(Appendable accum, String string, OutputSettings out, - boolean forText, boolean forAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException { - - boolean lastWasWhite = false; - boolean reachedNonWhite = false; + static void escape(Appendable accum, String string, OutputSettings out, int options) throws IOException { final EscapeMode escapeMode = out.escapeMode(); final CharsetEncoder encoder = out.encoder(); final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder() final int length = string.length(); int codePoint; + boolean lastWasWhite = false; + boolean reachedNonWhite = false; boolean skipped = false; for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { codePoint = string.codePointAt(offset); - if (normaliseWhite) { + if ((options & Normalise) != 0) { if (StringUtil.isWhitespace(codePoint)) { - if (stripLeadingWhite && !reachedNonWhite) continue; + if ((options & TrimLeading) != 0 && !reachedNonWhite) continue; if (lastWasWhite) continue; - if (trimTrailing) { + if ((options & TrimTrailing) != 0) { skipped = true; continue; } @@ -202,71 +202,78 @@ static void escape(Appendable accum, String string, OutputSettings out, } } } - // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): - if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) { - final char c = (char) codePoint; - // html specific and required escapes: - switch (c) { - case '&': - accum.append("&"); - break; - case 0xA0: - if (escapeMode != EscapeMode.xhtml) - accum.append(" "); - else - accum.append(" "); - break; - case '<': - // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val - if (forText || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml) - accum.append("<"); - else - accum.append(c); - break; - case '>': - if (forText) - accum.append(">"); - else - accum.append(c); - break; - case '"': - if (forAttribute) - accum.append("""); - else - accum.append(c); - break; - case '\'': - if (forAttribute && forText) { // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape. - if (escapeMode == EscapeMode.xhtml) - accum.append("'"); - else - accum.append("'"); - } - else - accum.append(c); - break; - // we escape ascii control ': + if ((options & ForText) != 0) accum.append(">"); + else accum.append(c); + break; + case '"': + if ((options & ForAttribute) != 0) accum.append("""); + else accum.append(c); + break; + case '\'': + // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape. + appendApos(accum, options, escapeMode); + break; + // we escape ascii control Date: Tue, 30 Jul 2024 13:02:01 +1000 Subject: [PATCH 2/2] Avoid version sniffer issue --- CHANGES.md | 2 +- src/main/java/org/jsoup/nodes/Entities.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 4cbaa6c520..d8f914f544 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,7 +10,7 @@ root `Document` node. This enables source position tracking on the Document node (which was previously unset). And it also enables the node traversor to see the outer Document node. [2182](https://github.com/jhy/jsoup/pull/2182) * Speed optimized `html()` and `Entities.escape()` when the input contains UTF characters in a supplementary plane, by - around 22%. + around 22%. [2183](https://github.com/jhy/jsoup/pull/2183) ### Bug Fixes diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java index 4968067891..36720abba4 100644 --- a/src/main/java/org/jsoup/nodes/Entities.java +++ b/src/main/java/org/jsoup/nodes/Entities.java @@ -322,7 +322,7 @@ private static boolean canEncode(final CoreCharset charset, final char c, final case ascii: return c < 0x80; case utf: - return !Character.isSurrogate(c); + return !(c >= Character.MIN_SURROGATE && c < (Character.MAX_SURROGATE + 1)); // !Character.isSurrogate(c); but not in Android 10 desugar default: return fallback.canEncode(c); }