From 0c715366108951b623b59166e1d5142497a6df68 Mon Sep 17 00:00:00 2001 From: Magmaruss Date: Sun, 18 Jun 2023 15:42:50 +0200 Subject: [PATCH 1/9] Support for entity surrogate pairs (#165) --- .../java/com/ctc/wstx/api/ReaderConfig.java | 24 +++ .../com/ctc/wstx/api/WstxInputProperties.java | 7 + .../java/com/ctc/wstx/sr/StreamScanner.java | 142 +++++++++++++----- .../org/codehaus/stax/test/BaseStaxTest.java | 10 ++ .../stax/test/stream/TestEntityRead.java | 122 ++++++++++++--- 5 files changed, 251 insertions(+), 54 deletions(-) diff --git a/src/main/java/com/ctc/wstx/api/ReaderConfig.java b/src/main/java/com/ctc/wstx/api/ReaderConfig.java index 56a65700..8c6e43c5 100644 --- a/src/main/java/com/ctc/wstx/api/ReaderConfig.java +++ b/src/main/java/com/ctc/wstx/api/ReaderConfig.java @@ -138,6 +138,8 @@ public final class ReaderConfig final static int PROP_MAX_ENTITY_DEPTH = 68; final static int PROP_MAX_DTD_DEPTH = 69; + + final static int PROP_RESOLVE_ENTITY_SURROGATE_PAIRS = 70; /* //////////////////////////////////////////////// @@ -361,6 +363,8 @@ public final class ReaderConfig PROP_UNDECLARED_ENTITY_RESOLVER); sProperties.put(WstxInputProperties.P_BASE_URL, PROP_BASE_URL); + sProperties.put(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS, + PROP_RESOLVE_ENTITY_SURROGATE_PAIRS); sProperties.put(WstxInputProperties.P_INPUT_PARSING_MODE, PROP_INPUT_PARSING_MODE); } @@ -418,6 +422,11 @@ public final class ReaderConfig * references */ protected URL mBaseURL; + + /** + * Resolve surrogate pairs of entities (2 code-points as one target character) + */ + protected boolean mResolveEntitySurrogatePairs = Boolean.FALSE; /** * Parsing mode can be changed from the default xml compliant @@ -583,6 +592,7 @@ public ReaderConfig createNonShared(SymbolTable sym) rc.mMaxEntityDepth = mMaxEntityDepth; rc.mMaxEntityCount = mMaxEntityCount; rc.mMaxDtdDepth = mMaxDtdDepth; + rc.mResolveEntitySurrogatePairs = mResolveEntitySurrogatePairs; if (mSpecialProperties != null) { int len = mSpecialProperties.length; Object[] specProps = new Object[len]; @@ -791,6 +801,10 @@ public XMLResolver getUndeclaredEntityResolver() { } public URL getBaseURL() { return mBaseURL; } + + public boolean isResolvingEntitySurrogatePairs() { + return mResolveEntitySurrogatePairs; + } public WstxInputProperties.ParsingMode getInputParsingMode() { return mParsingMode; @@ -1074,6 +1088,10 @@ public void setUndeclaredEntityResolver(XMLResolver r) { } public void setBaseURL(URL baseURL) { mBaseURL = baseURL; } + + public void setResolveEntitySurrogatePairs(boolean resolveEntitySurrogatePairs) { + mResolveEntitySurrogatePairs = resolveEntitySurrogatePairs; + } public void setInputParsingMode(WstxInputProperties.ParsingMode mode) { mParsingMode = mode; @@ -1533,6 +1551,8 @@ public Object getProperty(int id) return getUndeclaredEntityResolver(); case PROP_BASE_URL: return getBaseURL(); + case PROP_RESOLVE_ENTITY_SURROGATE_PAIRS: + return isResolvingEntitySurrogatePairs(); case PROP_INPUT_PARSING_MODE: return getInputParsingMode(); @@ -1757,6 +1777,10 @@ public boolean setProperty(String propName, int id, Object value) setBaseURL(u); } break; + + case PROP_RESOLVE_ENTITY_SURROGATE_PAIRS: + setResolveEntitySurrogatePairs(ArgUtil.convertToBoolean(propName, value)); + break; case PROP_INPUT_PARSING_MODE: setInputParsingMode((WstxInputProperties.ParsingMode) value); diff --git a/src/main/java/com/ctc/wstx/api/WstxInputProperties.java b/src/main/java/com/ctc/wstx/api/WstxInputProperties.java index 840e7ddd..9625dcdb 100644 --- a/src/main/java/com/ctc/wstx/api/WstxInputProperties.java +++ b/src/main/java/com/ctc/wstx/api/WstxInputProperties.java @@ -300,6 +300,13 @@ public final class WstxInputProperties * DTD subset). */ public final static String P_BASE_URL = "com.ctc.wstx.baseURL"; + + /** + * Property of type {@link java.lang.Boolean}, that will allow parsing + * high unicode characters written by surrogate pairs (2 code points) + * Default set as Boolean.FALSE, because it is not a standard behavior + */ + public final static String P_RESOLVE_ENTITY_SURROGATE_PAIRS = "com.ctc.wstx.resolveEntitySurrogatePairs"; // // // Alternate parsing modes diff --git a/src/main/java/com/ctc/wstx/sr/StreamScanner.java b/src/main/java/com/ctc/wstx/sr/StreamScanner.java index d2257a6a..455df74b 100644 --- a/src/main/java/com/ctc/wstx/sr/StreamScanner.java +++ b/src/main/java/com/ctc/wstx/sr/StreamScanner.java @@ -1188,55 +1188,121 @@ protected int resolveSimpleEntity(boolean checkStd) if (c == '#') { c = buf[ptr++]; int value = 0; + // Turns to 0 when first entity is read (must be a high surrogate char) + int pairValue = -1; int inputLen = mInputEnd; - if (c == 'x') { // hex - while (ptr < inputLen) { - c = buf[ptr++]; - if (c == ';') { - break; - } - value = value << 4; - if (c <= '9' && c >= '0') { - value += (c - '0'); - } else if (c >= 'a' && c <= 'f') { - value += (10 + (c - 'a')); - } else if (c >= 'A' && c <= 'F') { - value += (10 + (c - 'A')); - } else { - mInputPtr = ptr; // so error points to correct char - throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)."); + boolean isValueHighSurrogate = false; + + /* + * Loop used only to continue iteration to search surrogate pair. + * If simple entity, it does only one iteration + */ + while (value == 0 + || (mConfig.isResolvingEntitySurrogatePairs() + && value >= 0xD800 && value <= 0xDBFF && pairValue <= 0)) { + + int tmpValue = pairValue >= 0 ? pairValue : value; + + if (c == 'x') { // hex + while (ptr < inputLen) { + c = buf[ptr++]; + if (c == ';') { + break; + } + + tmpValue = tmpValue << 4; + if (c <= '9' && c >= '0') { + tmpValue += (c - '0'); + } else if (c >= 'a' && c <= 'f') { + tmpValue += (10 + (c - 'a')); + } else if (c >= 'A' && c <= 'F') { + tmpValue += (10 + (c - 'A')); + } else { + mInputPtr = ptr; // so error points to correct char + throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)."); + } + /* Need to check for overflow; easiest to do right as + * it happens... + */ + if (tmpValue > MAX_UNICODE_CHAR) { + reportUnicodeOverflow(); + } } - /* Need to check for overflow; easiest to do right as - * it happens... - */ - if (value > MAX_UNICODE_CHAR) { - reportUnicodeOverflow(); + } else { // numeric (decimal) + while (c != ';') { + if (c <= '9' && c >= '0') { + tmpValue = (tmpValue * 10) + (c - '0'); + // Overflow? + if (tmpValue > MAX_UNICODE_CHAR) { + reportUnicodeOverflow(); + } + + } else { + mInputPtr = ptr; // so error points to correct char + throwUnexpectedChar(c, "; expected a decimal number."); + } + if (ptr >= inputLen) { + break; + } + c = buf[ptr++]; } } - } else { // numeric (decimal) - while (c != ';') { - if (c <= '9' && c >= '0') { - value = (value * 10) + (c - '0'); - // Overflow? - if (value > MAX_UNICODE_CHAR) { - reportUnicodeOverflow(); + + if (pairValue >= 0) { + pairValue = tmpValue; + } else { + value = tmpValue; + } + + isValueHighSurrogate = value >= 0xD800 && value <= 0xDBFF; + + /* + * If resolving entity surrogate pairs enabled and if current entity + * is in range of high surrogate value, try to find surrogate pair + */ + if (isValueHighSurrogate && mConfig.isResolvingEntitySurrogatePairs() + && c == ';' && pairValue < 0 && ptr + 1 < inputLen) { + c = buf[ptr++]; + + if (c == '&' && ptr + 1 < inputLen) { + c = buf[ptr++]; + + if (c == '#' && ptr + 1 < inputLen) { + c = buf[ptr++]; + pairValue = 0; + continue; + } else { + reportNoSurrogatePair(value); } } else { - mInputPtr = ptr; // so error points to correct char - throwUnexpectedChar(c, "; expected a decimal number."); + reportNoSurrogatePair(value); } - if (ptr >= inputLen) { - break; - } - c = buf[ptr++]; + } else if (isValueHighSurrogate + && mConfig.isResolvingEntitySurrogatePairs() + && ptr + 1 >= inputLen) { + reportNoSurrogatePair(value); } } + /* We get here either if we got it all, OR if we ran out of * input in current buffer. */ if (c == ';') { // got the full thing mInputPtr = ptr; - validateChar(value); + + if (mConfig.isResolvingEntitySurrogatePairs() && pairValue > 0) { + /* + * If pair value is not in range of low surrogate values, then throw an error + */ + if (pairValue < 0xDC00 || pairValue > 0xDFFF) { + reportNoSurrogatePair(value); + } + + value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00); + } else { + validateChar(value); + } + return value; } @@ -2464,6 +2530,12 @@ private void reportIllegalChar(int value) { throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null); } + + private void reportNoSurrogatePair(int highSurrogate) + throws XMLStreamException + { + throwParseError("Cannot find surrogate pair: high surrogate character (code 0x{0})", Integer.toHexString(highSurrogate), null); + } protected void verifyLimit(String type, long maxValue, long currentValue) throws XMLStreamException diff --git a/src/test/java/org/codehaus/stax/test/BaseStaxTest.java b/src/test/java/org/codehaus/stax/test/BaseStaxTest.java index b5e1438d..f1348731 100644 --- a/src/test/java/org/codehaus/stax/test/BaseStaxTest.java +++ b/src/test/java/org/codehaus/stax/test/BaseStaxTest.java @@ -8,6 +8,8 @@ import javax.xml.stream.*; import javax.xml.stream.events.XMLEvent; +import com.ctc.wstx.api.WstxInputProperties; + /* Latest updates: * * - 07-Sep-2007, TSa: Updating based on latest understanding of @@ -275,6 +277,14 @@ protected static boolean setSupportExternalEntities(XMLInputFactory f, boolean s return false; } } + + protected static void setResolveEntitySurrogatePairs(XMLInputFactory f, boolean state) + throws XMLStreamException + { + Boolean b = state ? Boolean.TRUE : Boolean.FALSE; + f.setProperty(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS, b); + assertEquals(b, f.getProperty(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS)); + } protected static void setResolver(XMLInputFactory f, XMLResolver resolver) throws XMLStreamException diff --git a/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java b/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java index cb92c6c9..f0efdba0 100644 --- a/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java +++ b/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java @@ -1,5 +1,9 @@ package org.codehaus.stax.test.stream; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; + import javax.xml.stream.*; import org.codehaus.stax.test.SimpleResolver; @@ -27,7 +31,7 @@ public void testValidPredefdEntities() String EXP = "Testing \"this\" & 'that' !? !"; String XML = "Testing "this" & 'that' !? !"; - XMLStreamReader sr = getReader(XML, false, true, true); + XMLStreamReader sr = getReader(XML, false, true, true, false); assertTokenType(START_ELEMENT, sr.next()); assertTokenType(CHARACTERS, sr.next()); @@ -52,7 +56,7 @@ public void testValidCharEntities() throws XMLStreamException { String XML = "surrogates: 񐀀."; - XMLStreamReader sr = getReader(XML, true, true, true); + XMLStreamReader sr = getReader(XML, true, true, true, false); assertTokenType(START_ELEMENT, sr.next()); assertTokenType(CHARACTERS, sr.next()); @@ -73,6 +77,52 @@ public void testValidCharEntities() assertTokenType(END_ELEMENT, type); sr.close(); } + + /** + * This unit test checks that resolving of surrogate pairs works + * as expected, including different ways of writing entities + */ + public void testValidSurrogatePairEntities() + throws XMLStreamException + { + final Map xmlWithExp = new HashMap(); + // Numeric surrogate pairs + xmlWithExp.put("surrogate pair: ��.", + "surrogate pair: \uD83C\uDF85."); + // Hex and numeric surrogate pairs + xmlWithExp.put("surrogate pair: ��.", + "surrogate pair: \uD83C\uDF85."); + // Numeric and hex surrogate pairs + xmlWithExp.put("surrogate pair: ��.", + "surrogate pair: \uD83C\uDF85."); + // Hex surrogate pairs + xmlWithExp.put("surrogate pair: ��.", + "surrogate pair: \uD83C\uDF85."); + // Two surrogate pairs + xmlWithExp.put("surrogate pair: ����.", + "surrogate pair: \uD83C\uDF85\uD83C\uDF84."); + // Surrogate pair and simple entity + xmlWithExp.put("surrogate pair: ��™.", + "surrogate pair: \uD83C\uDF85\u2122."); + + for (Entry xmlExp: xmlWithExp.entrySet()) { + XMLStreamReader sr = getReader(xmlExp.getKey(), true, true, true, true); + assertTokenType(START_ELEMENT, sr.next()); + assertTokenType(CHARACTERS, sr.next()); + + StringBuffer sb = new StringBuffer(getAndVerifyText(sr)); + int type; + + while ((type = sr.next()) == CHARACTERS) { + sb.append(getAndVerifyText(sr)); + } + + String result = sb.toString(); + assertEquals(xmlExp.getValue(), result); + assertTokenType(END_ELEMENT, type); + sr.close(); + } + } public void testValidGeneralEntities() throws XMLStreamException @@ -85,7 +135,7 @@ public void testValidGeneralEntities() +"]>\n" +"&x; &both; &aa;&myAmp;"; - XMLStreamReader sr = getReader(XML, false, true, true); + XMLStreamReader sr = getReader(XML, false, true, true, false); assertTokenType(DTD, sr.next()); int type = sr.next(); @@ -126,7 +176,7 @@ public void testUnexpandedEntities() +" ]>\n" +"&Start"&myent;End!"; - XMLStreamReader sr = getReader(XML, false, true, false); + XMLStreamReader sr = getReader(XML, false, true, false, false); assertTokenType(DTD, sr.next()); int type = sr.next(); @@ -163,11 +213,11 @@ public void testUnexpandedEntities() +""; // First, no coalescing - sr = getReader(XML, false, false, false); + sr = getReader(XML, false, false, false, false); streamThrough(sr); // then with coalescing - sr = getReader(XML, false, true, false); + sr = getReader(XML, false, true, false, false); streamThrough(sr); } @@ -184,7 +234,7 @@ public void testUnexpandedEntities2() +" ]>" +"&myent;"; - XMLStreamReader sr = getReader(XML, false, true, false); + XMLStreamReader sr = getReader(XML, false, true, false, false); assertTokenType(DTD, sr.next()); assertTokenType(START_ELEMENT, sr.next()); @@ -225,7 +275,7 @@ public void testElementEntities() +"]>\n" +"&ent1;&ent2;&ent3;&ent4a;"; - XMLStreamReader sr = getReader(XML, true, true, true); + XMLStreamReader sr = getReader(XML, true, true, true, false); assertTokenType(DTD, sr.next()); // May or may not get whitespace @@ -289,7 +339,7 @@ public void testQuotedCDataEndMarker() +" and then alternatives: ]]>" +", ]]>" +""; - XMLStreamReader sr = getReader(XML, true, false, true); + XMLStreamReader sr = getReader(XML, true, false, true, false); streamThrough(sr); } catch (Exception e) { fail("Didn't except problems with pre-def/char entity quoted ']]>'; got: "+e); @@ -303,7 +353,7 @@ public void testQuotedCDataEndMarker() +"" +" &doubleBracket;> and &doubleBracket;>" +""; - XMLStreamReader sr = getReader(XML, true, false, true); + XMLStreamReader sr = getReader(XML, true, false, true, false); streamThrough(sr); } catch (Exception e) { fail("Didn't except problems with general entity quoted ']]>'; got: "+e); @@ -326,7 +376,7 @@ public void testInvalidEntityUndeclared() throws XMLStreamException { XMLStreamReader sr = getReader("&myent;", - true, false, true); + true, false, true, false); try { streamThrough(sr); fail("Expected an exception for invalid comment content"); @@ -341,7 +391,7 @@ public void testInvalidEntityRecursive() +"\n" +"\n" +"]> &ent1;", - false, true, true); + false, true, true, false); streamThroughFailing(sr, "recursive general entity/ies"); @@ -363,7 +413,7 @@ public void testInvalidEntityPEInIntSubset() +"\n" +"\n" +"]> ", - false, true, true); + false, true, true, false); streamThroughFailing(sr, "declaring a parameter entity in the internal DTD subset"); } @@ -382,7 +432,7 @@ public void testInvalidEntityPartial() ("\n" +"]>&partial;;", - false, false, true); + false, false, true, false); /* Hmmh. Actually, fully conforming implementations should throw * an exception when parsing internal DTD subset. But better @@ -407,6 +457,35 @@ public void testInvalidEntityPartial() assertTokenType(START_ELEMENT, type2); fail("Expected an exception for partial entity reference: current token after text: "+tokenTypeDesc(lastType)); } + + /** + * Test that ensures that an invalid surrogate pair entities is caught. + * It could be pair of high surrogate and simple entity, high surrogate + * with no pair, low surrogate as first or unclosed entity + */ + public void testInvalidSurrogatePairEntities() + throws XMLStreamException + { + final String[] invalidSurrogatePairs = { + // Invalid pair + "surrogate pair: �ᙚ.", + // No pair + "surrogate pair: �.", + // Low surrogate as first + "surrogate pair: ��.", + // Unclosed second entity + "surrogate pair: �ȩ" + }; + + for (String invalidSurrogatePair: invalidSurrogatePairs) { + XMLStreamReader sr = getReader(invalidSurrogatePair, + true, false, true, true); + try { + streamThrough(sr); + fail("Expected an exception for invalid surrogate pair"); + } catch (Exception e) { } + } + } /** * This unit test checks that external entities can be resolved; and @@ -424,7 +503,7 @@ public void testExternalEntityWithResolver() +"]>ent='&extEnt;'"; // ns-aware, coalescing (to simplify verifying), entity expanding - XMLInputFactory f = doGetFactory(true, true, true); + XMLInputFactory f = doGetFactory(true, true, true, false); if (!setSupportExternalEntities(f, true)) { reportNADueToExtEnt("testExternalEntityWithResolver"); @@ -482,7 +561,7 @@ private void doTestProperties(boolean nsAware) +"\n" +"\n" +"]>&myent;&ent2;", - nsAware, false, false); + nsAware, false, false, false); assertTokenType(DTD, sr.next()); assertTokenType(START_ELEMENT, sr.next()); @@ -611,15 +690,19 @@ private void doTestProperties(boolean nsAware) * need to be enabled just for that purpose. */ private XMLStreamReader getReader(String contents, boolean nsAware, - boolean coalescing, boolean replEntities) + boolean coalescing, boolean replEntities, + boolean resolveSurrogatePairs) throws XMLStreamException { - XMLInputFactory f = doGetFactory(nsAware, coalescing, replEntities); + XMLInputFactory f = doGetFactory(nsAware, coalescing, + replEntities, resolveSurrogatePairs); return constructStreamReader(f, contents); } private XMLInputFactory doGetFactory(boolean nsAware, - boolean coalescing, boolean replEntities) + boolean coalescing, + boolean replEntities, + boolean resolveSurrogatePairs) throws XMLStreamException { XMLInputFactory f = getInputFactory(); @@ -629,6 +712,7 @@ private XMLInputFactory doGetFactory(boolean nsAware, setSupportExternalEntities(f, true); setReplaceEntities(f, replEntities); setValidating(f, false); + setResolveEntitySurrogatePairs(f, resolveSurrogatePairs); return f; } } From d85a50dbb1fc653b199a9632e767779dd0216091 Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Sun, 12 Nov 2023 18:29:28 -0800 Subject: [PATCH 2/9] Made changes I suggested in PR code review, so naming kept consistent. --- .../java/com/ctc/wstx/api/ReaderConfig.java | 37 +++++++++++-------- .../com/ctc/wstx/api/WstxInputProperties.java | 4 +- .../java/com/ctc/wstx/sr/StreamScanner.java | 8 ++-- .../org/codehaus/stax/test/BaseStaxTest.java | 4 +- 4 files changed, 30 insertions(+), 23 deletions(-) diff --git a/src/main/java/com/ctc/wstx/api/ReaderConfig.java b/src/main/java/com/ctc/wstx/api/ReaderConfig.java index 8c6e43c5..d71be319 100644 --- a/src/main/java/com/ctc/wstx/api/ReaderConfig.java +++ b/src/main/java/com/ctc/wstx/api/ReaderConfig.java @@ -138,8 +138,11 @@ public final class ReaderConfig final static int PROP_MAX_ENTITY_DEPTH = 68; final static int PROP_MAX_DTD_DEPTH = 69; - - final static int PROP_RESOLVE_ENTITY_SURROGATE_PAIRS = 70; + + /** + * @since 6.6 + */ + final static int PROP_ALLOW_SURROGATE_PAIR_ENTITIES = 70; /* //////////////////////////////////////////////// @@ -363,8 +366,8 @@ public final class ReaderConfig PROP_UNDECLARED_ENTITY_RESOLVER); sProperties.put(WstxInputProperties.P_BASE_URL, PROP_BASE_URL); - sProperties.put(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS, - PROP_RESOLVE_ENTITY_SURROGATE_PAIRS); + sProperties.put(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES, + PROP_ALLOW_SURROGATE_PAIR_ENTITIES); sProperties.put(WstxInputProperties.P_INPUT_PARSING_MODE, PROP_INPUT_PARSING_MODE); } @@ -424,9 +427,11 @@ public final class ReaderConfig protected URL mBaseURL; /** - * Resolve surrogate pairs of entities (2 code-points as one target character) + * Whether to allow surrogate pairs as entities (2 code-points as one target character). + * + * @since 66 */ - protected boolean mResolveEntitySurrogatePairs = Boolean.FALSE; + protected boolean mAllowSurrogatePairEntities = false; /** * Parsing mode can be changed from the default xml compliant @@ -592,7 +597,7 @@ public ReaderConfig createNonShared(SymbolTable sym) rc.mMaxEntityDepth = mMaxEntityDepth; rc.mMaxEntityCount = mMaxEntityCount; rc.mMaxDtdDepth = mMaxDtdDepth; - rc.mResolveEntitySurrogatePairs = mResolveEntitySurrogatePairs; + rc.mAllowSurrogatePairEntities = mAllowSurrogatePairEntities; if (mSpecialProperties != null) { int len = mSpecialProperties.length; Object[] specProps = new Object[len]; @@ -801,9 +806,9 @@ public XMLResolver getUndeclaredEntityResolver() { } public URL getBaseURL() { return mBaseURL; } - - public boolean isResolvingEntitySurrogatePairs() { - return mResolveEntitySurrogatePairs; + + public boolean allowsSurrogatePairEntities() { + return mAllowSurrogatePairEntities; } public WstxInputProperties.ParsingMode getInputParsingMode() { @@ -1089,8 +1094,8 @@ public void setUndeclaredEntityResolver(XMLResolver r) { public void setBaseURL(URL baseURL) { mBaseURL = baseURL; } - public void setResolveEntitySurrogatePairs(boolean resolveEntitySurrogatePairs) { - mResolveEntitySurrogatePairs = resolveEntitySurrogatePairs; + public void doAllowSurrogatePairEntities(boolean state) { + mAllowSurrogatePairEntities = state; } public void setInputParsingMode(WstxInputProperties.ParsingMode mode) { @@ -1551,8 +1556,8 @@ public Object getProperty(int id) return getUndeclaredEntityResolver(); case PROP_BASE_URL: return getBaseURL(); - case PROP_RESOLVE_ENTITY_SURROGATE_PAIRS: - return isResolvingEntitySurrogatePairs(); + case PROP_ALLOW_SURROGATE_PAIR_ENTITIES: + return allowsSurrogatePairEntities(); case PROP_INPUT_PARSING_MODE: return getInputParsingMode(); @@ -1778,8 +1783,8 @@ public boolean setProperty(String propName, int id, Object value) } break; - case PROP_RESOLVE_ENTITY_SURROGATE_PAIRS: - setResolveEntitySurrogatePairs(ArgUtil.convertToBoolean(propName, value)); + case PROP_ALLOW_SURROGATE_PAIR_ENTITIES: + doAllowSurrogatePairEntities(ArgUtil.convertToBoolean(propName, value)); break; case PROP_INPUT_PARSING_MODE: diff --git a/src/main/java/com/ctc/wstx/api/WstxInputProperties.java b/src/main/java/com/ctc/wstx/api/WstxInputProperties.java index 9625dcdb..40773f71 100644 --- a/src/main/java/com/ctc/wstx/api/WstxInputProperties.java +++ b/src/main/java/com/ctc/wstx/api/WstxInputProperties.java @@ -305,8 +305,10 @@ public final class WstxInputProperties * Property of type {@link java.lang.Boolean}, that will allow parsing * high unicode characters written by surrogate pairs (2 code points) * Default set as Boolean.FALSE, because it is not a standard behavior + * + * @since 6.6 */ - public final static String P_RESOLVE_ENTITY_SURROGATE_PAIRS = "com.ctc.wstx.resolveEntitySurrogatePairs"; + public final static String P_ALLOW_SURROGATE_PAIR_ENTITIES = "com.ctc.wstx.allowSurrogatePairEntities"; // // // Alternate parsing modes diff --git a/src/main/java/com/ctc/wstx/sr/StreamScanner.java b/src/main/java/com/ctc/wstx/sr/StreamScanner.java index 455df74b..c85d2141 100644 --- a/src/main/java/com/ctc/wstx/sr/StreamScanner.java +++ b/src/main/java/com/ctc/wstx/sr/StreamScanner.java @@ -1198,7 +1198,7 @@ protected int resolveSimpleEntity(boolean checkStd) * If simple entity, it does only one iteration */ while (value == 0 - || (mConfig.isResolvingEntitySurrogatePairs() + || (mConfig.allowsSurrogatePairEntities() && value >= 0xD800 && value <= 0xDBFF && pairValue <= 0)) { int tmpValue = pairValue >= 0 ? pairValue : value; @@ -1260,7 +1260,7 @@ protected int resolveSimpleEntity(boolean checkStd) * If resolving entity surrogate pairs enabled and if current entity * is in range of high surrogate value, try to find surrogate pair */ - if (isValueHighSurrogate && mConfig.isResolvingEntitySurrogatePairs() + if (isValueHighSurrogate && mConfig.allowsSurrogatePairEntities() && c == ';' && pairValue < 0 && ptr + 1 < inputLen) { c = buf[ptr++]; @@ -1278,7 +1278,7 @@ protected int resolveSimpleEntity(boolean checkStd) reportNoSurrogatePair(value); } } else if (isValueHighSurrogate - && mConfig.isResolvingEntitySurrogatePairs() + && mConfig.allowsSurrogatePairEntities() && ptr + 1 >= inputLen) { reportNoSurrogatePair(value); } @@ -1290,7 +1290,7 @@ protected int resolveSimpleEntity(boolean checkStd) if (c == ';') { // got the full thing mInputPtr = ptr; - if (mConfig.isResolvingEntitySurrogatePairs() && pairValue > 0) { + if (mConfig.allowsSurrogatePairEntities() && pairValue > 0) { /* * If pair value is not in range of low surrogate values, then throw an error */ diff --git a/src/test/java/org/codehaus/stax/test/BaseStaxTest.java b/src/test/java/org/codehaus/stax/test/BaseStaxTest.java index f1348731..fea81ec7 100644 --- a/src/test/java/org/codehaus/stax/test/BaseStaxTest.java +++ b/src/test/java/org/codehaus/stax/test/BaseStaxTest.java @@ -282,8 +282,8 @@ protected static void setResolveEntitySurrogatePairs(XMLInputFactory f, boolean throws XMLStreamException { Boolean b = state ? Boolean.TRUE : Boolean.FALSE; - f.setProperty(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS, b); - assertEquals(b, f.getProperty(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS)); + f.setProperty(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES, b); + assertEquals(b, f.getProperty(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES)); } protected static void setResolver(XMLInputFactory f, XMLResolver resolver) From 8ed199ac8515a3a9843f04168a566bec374476e5 Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Sun, 12 Nov 2023 18:35:07 -0800 Subject: [PATCH 3/9] Update release notes wrt this PR --- release-notes/CREDITS | 5 +++++ release-notes/VERSION | 2 ++ 2 files changed, 7 insertions(+) diff --git a/release-notes/CREDITS b/release-notes/CREDITS index 88502a76..73a1b381 100644 --- a/release-notes/CREDITS +++ b/release-notes/CREDITS @@ -84,3 +84,8 @@ Guillaume Nodet (@gnodet) * Contributed #176: Fix parser when not replacing entities and treating char references as entities (6.6.0) + +Kamil Gołębiewski (@Magmaruss) + +* Contributed #165: Add support to optionally allow surrogate pair entities + (6.6.0) diff --git a/release-notes/VERSION b/release-notes/VERSION index 798fb58a..554cc271 100644 --- a/release-notes/VERSION +++ b/release-notes/VERSION @@ -6,6 +6,8 @@ Project: woodstox 6.6.0 (not yet released) +#165: Add support to optionally allow surrogate pair entities + (contributed by Kamil G) #176: Fix parser when not replacing entities and treating char references as entities (contributed by Guillaume N) From 1dcb8be1dd7ee18a7a998e87004022983ba6fa42 Mon Sep 17 00:00:00 2001 From: Magmaruss Date: Mon, 4 Dec 2023 23:43:05 +0100 Subject: [PATCH 4/9] Changes after PR review --- .../java/com/ctc/wstx/sr/StreamScanner.java | 133 +++++++----------- .../stax/test/stream/TestEntityRead.java | 26 +++- 2 files changed, 67 insertions(+), 92 deletions(-) diff --git a/src/main/java/com/ctc/wstx/sr/StreamScanner.java b/src/main/java/com/ctc/wstx/sr/StreamScanner.java index bedbdda8..83c387b0 100644 --- a/src/main/java/com/ctc/wstx/sr/StreamScanner.java +++ b/src/main/java/com/ctc/wstx/sr/StreamScanner.java @@ -1186,102 +1186,48 @@ protected int resolveSimpleEntity(boolean checkStd) // Numeric reference? if (c == '#') { - c = buf[ptr++]; int value = 0; - // Turns to 0 when first entity is read (must be a high surrogate char) - int pairValue = -1; + int pairValue = 0; int inputLen = mInputEnd; - boolean isValueHighSurrogate = false; + final StringBuffer buffer = new StringBuffer(new String(buf)); + + mInputPtr = ptr; + value = resolveCharEnt(buffer, false); + ptr = mInputPtr; + c = buf[ptr - 1]; + + final boolean isValueHighSurrogate = value >= 0xD800 && value <= 0xDBFF; /* - * Loop used only to continue iteration to search surrogate pair. - * If simple entity, it does only one iteration + * If resolving entity surrogate pairs enabled and if current entity + * is in range of high surrogate value, try to find surrogate pair */ - while (value == 0 - || (mConfig.allowsSurrogatePairEntities() - && value >= 0xD800 && value <= 0xDBFF && pairValue <= 0)) { - - int tmpValue = pairValue >= 0 ? pairValue : value; - - if (c == 'x') { // hex - while (ptr < inputLen) { - c = buf[ptr++]; - if (c == ';') { - break; - } - - tmpValue = tmpValue << 4; - if (c <= '9' && c >= '0') { - tmpValue += (c - '0'); - } else if (c >= 'a' && c <= 'f') { - tmpValue += (10 + (c - 'a')); - } else if (c >= 'A' && c <= 'F') { - tmpValue += (10 + (c - 'A')); - } else { - mInputPtr = ptr; // so error points to correct char - throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)."); - } - /* Need to check for overflow; easiest to do right as - * it happens... - */ - if (tmpValue > MAX_UNICODE_CHAR) { - reportUnicodeOverflow(); - } - } - } else { // numeric (decimal) - while (c != ';') { - if (c <= '9' && c >= '0') { - tmpValue = (tmpValue * 10) + (c - '0'); - // Overflow? - if (tmpValue > MAX_UNICODE_CHAR) { - reportUnicodeOverflow(); - } - - } else { - mInputPtr = ptr; // so error points to correct char - throwUnexpectedChar(c, "; expected a decimal number."); - } - if (ptr >= inputLen) { - break; - } - c = buf[ptr++]; - } - } - - if (pairValue >= 0) { - pairValue = tmpValue; - } else { - value = tmpValue; - } - - isValueHighSurrogate = value >= 0xD800 && value <= 0xDBFF; + if (isValueHighSurrogate && mConfig.allowsSurrogatePairEntities() + && c == ';' && ptr + 1 < inputLen) { + c = buf[ptr++]; - /* - * If resolving entity surrogate pairs enabled and if current entity - * is in range of high surrogate value, try to find surrogate pair - */ - if (isValueHighSurrogate && mConfig.allowsSurrogatePairEntities() - && c == ';' && pairValue < 0 && ptr + 1 < inputLen) { + if (c == '&' && ptr + 1 < inputLen) { c = buf[ptr++]; - if (c == '&' && ptr + 1 < inputLen) { - c = buf[ptr++]; - - if (c == '#' && ptr + 1 < inputLen) { - c = buf[ptr++]; - pairValue = 0; - continue; - } else { + if (c == '#' && ptr + 1 < inputLen) { + try { + mInputPtr = ptr; + pairValue = resolveCharEnt(buffer, false); + ptr = mInputPtr; + c = buf[ptr -1]; + } catch(WstxUnexpectedCharException wuce) { reportNoSurrogatePair(value); } } else { reportNoSurrogatePair(value); } - } else if (isValueHighSurrogate - && mConfig.allowsSurrogatePairEntities() - && ptr + 1 >= inputLen) { + } else { reportNoSurrogatePair(value); } + } else if (isValueHighSurrogate + && mConfig.allowsSurrogatePairEntities() + && ptr + 1 >= inputLen) { + reportNoSurrogatePair(value); } /* We get here either if we got it all, OR if we ran out of @@ -1292,10 +1238,11 @@ protected int resolveSimpleEntity(boolean checkStd) if (mConfig.allowsSurrogatePairEntities() && pairValue > 0) { /* + * [woodstox-core#165] * If pair value is not in range of low surrogate values, then throw an error */ if (pairValue < 0xDC00 || pairValue > 0xDFFF) { - reportNoSurrogatePair(value); + reportInvalidSurrogatePair(value, pairValue); } value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00); @@ -2385,8 +2332,14 @@ protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs, // Internal methods /////////////////////////////////////////////////////////////////////// */ - + private int resolveCharEnt(StringBuffer originalCharacters) + throws XMLStreamException + { + return resolveCharEnt(originalCharacters, true); + } + + private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar) throws XMLStreamException { int value = 0; @@ -2441,7 +2394,11 @@ private int resolveCharEnt(StringBuffer originalCharacters) } } } - validateChar(value); + + if (validateChar) { + validateChar(value); + } + return value; } @@ -2528,7 +2485,7 @@ private void reportUnicodeOverflow() private void reportIllegalChar(int value) throws XMLStreamException { - throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null); + throwParseError("Illegal character entity: expansion character (code 0x{0})", Integer.toHexString(value), null); } private void reportNoSurrogatePair(int highSurrogate) @@ -2536,6 +2493,12 @@ private void reportNoSurrogatePair(int highSurrogate) { throwParseError("Cannot find surrogate pair: high surrogate character (code 0x{0})", Integer.toHexString(highSurrogate), null); } + + private void reportInvalidSurrogatePair(int firstSurrogate, int secondSurrogate) + throws XMLStreamException + { + throwParseError("Invalid surrogate pair: first surrogate character (code 0x{0}), second surrogate character (code 0x{1})", Integer.toHexString(firstSurrogate), Integer.toHexString(secondSurrogate)); + } protected void verifyLimit(String type, long maxValue, long currentValue) throws XMLStreamException diff --git a/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java b/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java index f0efdba0..31500040 100644 --- a/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java +++ b/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java @@ -466,24 +466,36 @@ public void testInvalidEntityPartial() public void testInvalidSurrogatePairEntities() throws XMLStreamException { - final String[] invalidSurrogatePairs = { + final String[][] invalidSurrogatePairsAndExpectedErrors = { // Invalid pair - "surrogate pair: �ᙚ.", + {"surrogate pair: �ᙚ.", "Invalid surrogate pair"}, // No pair - "surrogate pair: �.", + {"surrogate pair: �.", "Cannot find surrogate pair"}, // Low surrogate as first - "surrogate pair: ��.", + {"surrogate pair: ��.", "Illegal character entity"}, // Unclosed second entity - "surrogate pair: �ȩ" + {"surrogate pair: �ȩ", "Cannot find surrogate pair"} }; - for (String invalidSurrogatePair: invalidSurrogatePairs) { + for (String[] surrogateAndError: invalidSurrogatePairsAndExpectedErrors) { + final String invalidSurrogatePair = surrogateAndError[0]; + final String expectedErrorPhrase = surrogateAndError[1]; + XMLStreamReader sr = getReader(invalidSurrogatePair, true, false, true, true); try { streamThrough(sr); fail("Expected an exception for invalid surrogate pair"); - } catch (Exception e) { } + } catch (XMLStreamException e) { + if (!e.getMessage().startsWith(expectedErrorPhrase)) { + fail(String.format( + "Expected an exception starting from phrase: '%s' for invalid surrogate test case: '%s', but the message was: '%s'", + expectedErrorPhrase, + invalidSurrogatePair, + e.getMessage() + )); + } + } } } From 3d34620d08b675be5e5e3ff17285d92fb2e4b5a6 Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Wed, 6 Dec 2023 20:35:56 -0800 Subject: [PATCH 5/9] Minor simplification --- src/main/java/com/ctc/wstx/sr/StreamScanner.java | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/main/java/com/ctc/wstx/sr/StreamScanner.java b/src/main/java/com/ctc/wstx/sr/StreamScanner.java index 83c387b0..50952d6c 100644 --- a/src/main/java/com/ctc/wstx/sr/StreamScanner.java +++ b/src/main/java/com/ctc/wstx/sr/StreamScanner.java @@ -1198,8 +1198,7 @@ protected int resolveSimpleEntity(boolean checkStd) final boolean isValueHighSurrogate = value >= 0xD800 && value <= 0xDBFF; - /* - * If resolving entity surrogate pairs enabled and if current entity + /* If resolving entity surrogate pairs enabled and if current entity * is in range of high surrogate value, try to find surrogate pair */ if (isValueHighSurrogate && mConfig.allowsSurrogatePairEntities() @@ -1368,7 +1367,7 @@ protected int resolveCharOnlyEntity(boolean checkStd) // A char reference? if (c == '#') { // yup ++mInputPtr; - return resolveCharEnt(null); + return resolveCharEnt(null, true); } // nope... except may be a pre-def? @@ -1536,7 +1535,7 @@ protected int fullyResolveEntity(boolean allowExt) // Do we have a (numeric) character entity reference? if (c == '#') { // numeric final StringBuffer originalSurface = new StringBuffer("#"); - int ch = resolveCharEnt(originalSurface); + int ch = resolveCharEnt(originalSurface, true); if (mCfgTreatCharRefsAsEntities) { final char[] originalChars = new char[originalSurface.length()]; originalSurface.getChars(0, originalSurface.length(), originalChars, 0); @@ -2332,12 +2331,6 @@ protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs, // Internal methods /////////////////////////////////////////////////////////////////////// */ - - private int resolveCharEnt(StringBuffer originalCharacters) - throws XMLStreamException - { - return resolveCharEnt(originalCharacters, true); - } private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar) throws XMLStreamException From d5c42c0b9cb03ac95371b023addde43f5172775d Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Wed, 6 Dec 2023 20:39:17 -0800 Subject: [PATCH 6/9] Fix one `@since` tag, trivial whitespace removal --- .../java/com/ctc/wstx/api/ReaderConfig.java | 6 +++--- .../com/ctc/wstx/sr/BasicStreamReader.java | 19 +++++++------------ .../java/com/ctc/wstx/sr/StreamScanner.java | 11 ++++------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/src/main/java/com/ctc/wstx/api/ReaderConfig.java b/src/main/java/com/ctc/wstx/api/ReaderConfig.java index d71be319..f6514dce 100644 --- a/src/main/java/com/ctc/wstx/api/ReaderConfig.java +++ b/src/main/java/com/ctc/wstx/api/ReaderConfig.java @@ -419,17 +419,17 @@ public final class ReaderConfig // since 5.4/6.4 protected int mMaxDtdDepth = DEFAULT_MAX_DTD_DEPTH; - + /** * Base URL to use as the resolution context for relative entity * references */ protected URL mBaseURL; - + /** * Whether to allow surrogate pairs as entities (2 code-points as one target character). * - * @since 66 + * @since 6.6 */ protected boolean mAllowSurrogatePairEntities = false; diff --git a/src/main/java/com/ctc/wstx/sr/BasicStreamReader.java b/src/main/java/com/ctc/wstx/sr/BasicStreamReader.java index 87eb99ee..bbab11f4 100644 --- a/src/main/java/com/ctc/wstx/sr/BasicStreamReader.java +++ b/src/main/java/com/ctc/wstx/sr/BasicStreamReader.java @@ -3671,9 +3671,7 @@ private int skipCoalescedText(int i) private int skipTokenText(int i) throws XMLStreamException { - /* Fairly easy; except for potential to have entities - * expand to some crap? - */ + // Fairly easy; except for potential to have entities expand to some crap? int count = 0; main_loop: @@ -3690,18 +3688,15 @@ && resolveSimpleEntity(true) != 0) { ; } else { i = fullyResolveEntity(true); - /* Either way, it's just fine; we don't care about - * returned single-char value. - */ + // Either way, it's just fine; we don't care about + // returned single-char value. } } else { - /* Can only skip character entities; others need to - * be returned separately. - */ + // Can only skip character entities; others need to + // be returned separately. if (resolveCharOnlyEntity(true) == 0) { - /* Now points to the char after ampersand, and we need - * to return the ampersand itself - */ + // Now points to the char after ampersand, and we need + // to return the ampersand itself return i; } } diff --git a/src/main/java/com/ctc/wstx/sr/StreamScanner.java b/src/main/java/com/ctc/wstx/sr/StreamScanner.java index 50952d6c..589bc73e 100644 --- a/src/main/java/com/ctc/wstx/sr/StreamScanner.java +++ b/src/main/java/com/ctc/wstx/sr/StreamScanner.java @@ -2337,11 +2337,11 @@ private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar { int value = 0; char c = getNextChar(SUFFIX_IN_ENTITY_REF); - + if (originalCharacters != null) { originalCharacters.append(c); } - + if (c == 'x') { // hex while (true) { c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++] @@ -2387,11 +2387,9 @@ private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar } } } - if (validateChar) { validateChar(value); } - return value; } @@ -2402,9 +2400,8 @@ private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar private final void validateChar(int value) throws XMLStreamException { - /* 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic, - * need to be reported by a surrogate pair.. - */ + // 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic, + // need to be reported by a surrogate pair.. if (value >= 0xD800) { if (value < 0xE000) { // no surrogates via entity expansion reportIllegalChar(value); From eb5189d107d911709774c79e1ed445865567d7ff Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Wed, 6 Dec 2023 20:56:35 -0800 Subject: [PATCH 7/9] Further streamlining --- .../java/com/ctc/wstx/sr/StreamScanner.java | 74 ++++++++----------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/main/java/com/ctc/wstx/sr/StreamScanner.java b/src/main/java/com/ctc/wstx/sr/StreamScanner.java index 589bc73e..fd8996c7 100644 --- a/src/main/java/com/ctc/wstx/sr/StreamScanner.java +++ b/src/main/java/com/ctc/wstx/sr/StreamScanner.java @@ -1189,32 +1189,33 @@ protected int resolveSimpleEntity(boolean checkStd) int value = 0; int pairValue = 0; int inputLen = mInputEnd; - final StringBuffer buffer = new StringBuffer(new String(buf)); - + mInputPtr = ptr; - value = resolveCharEnt(buffer, false); + value = resolveCharEnt(null, false); ptr = mInputPtr; c = buf[ptr - 1]; final boolean isValueHighSurrogate = value >= 0xD800 && value <= 0xDBFF; - - /* If resolving entity surrogate pairs enabled and if current entity - * is in range of high surrogate value, try to find surrogate pair - */ - if (isValueHighSurrogate && mConfig.allowsSurrogatePairEntities() - && c == ';' && ptr + 1 < inputLen) { - c = buf[ptr++]; - - if (c == '&' && ptr + 1 < inputLen) { + + // If resolving entity surrogate pairs enabled and if current entity + // is in range of high surrogate value, try to find surrogate pair + if (isValueHighSurrogate && mConfig.allowsSurrogatePairEntities()) { + if (c == ';' && ptr + 1 < inputLen) { c = buf[ptr++]; - if (c == '#' && ptr + 1 < inputLen) { - try { - mInputPtr = ptr; - pairValue = resolveCharEnt(buffer, false); - ptr = mInputPtr; - c = buf[ptr -1]; - } catch(WstxUnexpectedCharException wuce) { + if (c == '&' && ptr + 1 < inputLen) { + c = buf[ptr++]; + + if (c == '#' && ptr + 1 < inputLen) { + try { + mInputPtr = ptr; + pairValue = resolveCharEnt(null, false); + ptr = mInputPtr; + c = buf[ptr -1]; + } catch(WstxUnexpectedCharException wuce) { + reportNoSurrogatePair(value); + } + } else { reportNoSurrogatePair(value); } } else { @@ -1223,27 +1224,19 @@ protected int resolveSimpleEntity(boolean checkStd) } else { reportNoSurrogatePair(value); } - } else if (isValueHighSurrogate - && mConfig.allowsSurrogatePairEntities() - && ptr + 1 >= inputLen) { - reportNoSurrogatePair(value); } - - /* We get here either if we got it all, OR if we ran out of - * input in current buffer. - */ + + // We get here either if we got it all, OR if we ran out of + // input in current buffer. if (c == ';') { // got the full thing mInputPtr = ptr; if (mConfig.allowsSurrogatePairEntities() && pairValue > 0) { - /* - * [woodstox-core#165] - * If pair value is not in range of low surrogate values, then throw an error - */ + // [woodstox-core#165] + // If pair value is not in range of low surrogate values, then throw an error if (pairValue < 0xDC00 || pairValue > 0xDFFF) { reportInvalidSurrogatePair(value, pairValue); } - value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00); } else { validateChar(value); @@ -1252,13 +1245,11 @@ protected int resolveSimpleEntity(boolean checkStd) return value; } - /* If we ran out of input, need to just fall back, gets - * resolved via 'full' resolution mechanism. - */ + // If we ran out of input, need to just fall back, gets + // resolved via 'full' resolution mechanism. } else if (checkStd) { - /* Caller may not want to resolve these quite yet... - * (when it wants separate events for non-char entities) - */ + // Caller may not want to resolve these quite yet... + // (when it wants separate events for non-char entities) if (c == 'a') { // amp or apos? c = buf[ptr++]; @@ -1549,10 +1540,9 @@ protected int fullyResolveEntity(boolean allowExt) // Perhaps we have a pre-defined char reference? c = id.charAt(0); - /* - * 16-May-2004, TSa: Should custom entities (or ones defined in int/ext subset) override - * pre-defined settings for these? - */ + + // 16-May-2004, TSa: Should custom entities (or ones defined in int/ext subset) override + // pre-defined settings for these? char d = CHAR_NULL; if (c == 'a') { // amp or apos? if (id.equals("amp")) { From c57023db61444fe0a1cbf673b3cbf4c036d1e30d Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Wed, 6 Dec 2023 21:02:43 -0800 Subject: [PATCH 8/9] Further simplification --- src/main/java/com/ctc/wstx/sr/StreamScanner.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/ctc/wstx/sr/StreamScanner.java b/src/main/java/com/ctc/wstx/sr/StreamScanner.java index fd8996c7..90a83f4d 100644 --- a/src/main/java/com/ctc/wstx/sr/StreamScanner.java +++ b/src/main/java/com/ctc/wstx/sr/StreamScanner.java @@ -1183,6 +1183,7 @@ protected int resolveSimpleEntity(boolean checkStd) char[] buf = mInputBuffer; int ptr = mInputPtr; char c = buf[ptr++]; + final boolean allowSurrogatePairs = mConfig.allowsSurrogatePairEntities(); // Numeric reference? if (c == '#') { @@ -1194,25 +1195,21 @@ protected int resolveSimpleEntity(boolean checkStd) value = resolveCharEnt(null, false); ptr = mInputPtr; c = buf[ptr - 1]; - - final boolean isValueHighSurrogate = value >= 0xD800 && value <= 0xDBFF; // If resolving entity surrogate pairs enabled and if current entity // is in range of high surrogate value, try to find surrogate pair - if (isValueHighSurrogate && mConfig.allowsSurrogatePairEntities()) { + if (allowSurrogatePairs && value >= 0xD800 && value <= 0xDBFF) { if (c == ';' && ptr + 1 < inputLen) { c = buf[ptr++]; - if (c == '&' && ptr + 1 < inputLen) { c = buf[ptr++]; - if (c == '#' && ptr + 1 < inputLen) { try { mInputPtr = ptr; pairValue = resolveCharEnt(null, false); ptr = mInputPtr; c = buf[ptr -1]; - } catch(WstxUnexpectedCharException wuce) { + } catch (WstxUnexpectedCharException wuce) { reportNoSurrogatePair(value); } } else { @@ -1230,8 +1227,8 @@ protected int resolveSimpleEntity(boolean checkStd) // input in current buffer. if (c == ';') { // got the full thing mInputPtr = ptr; - - if (mConfig.allowsSurrogatePairEntities() && pairValue > 0) { + + if (allowSurrogatePairs && pairValue > 0) { // [woodstox-core#165] // If pair value is not in range of low surrogate values, then throw an error if (pairValue < 0xDC00 || pairValue > 0xDFFF) { From 88c5358142a74723173ccb71dc98177daa0c7de5 Mon Sep 17 00:00:00 2001 From: Magmaruss Date: Mon, 15 Jan 2024 22:38:30 +0100 Subject: [PATCH 9/9] Test case for allow surrogate pair entities option disabled --- .../stax/test/stream/TestEntityRead.java | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java b/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java index 31500040..1f91b9d6 100644 --- a/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java +++ b/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java @@ -499,6 +499,38 @@ public void testInvalidSurrogatePairEntities() } } + /** + * Test that ensures that an exception is thrown when + * allow surrogate pair entities option is disabled. + * Expected default behavior should be an exception + * with message starting with: Illegal character entity + */ + public void testAllowSurrogatePairEntitiesDisabled() + throws XMLStreamException + { + final String expectedErrorPhrase = "Illegal character entity"; + final String surrogatePairEntitiesXML = "surrogate pair: ��."; + + final XMLStreamReader sr = getReader(surrogatePairEntitiesXML, true, true, true, false); + assertTokenType(START_ELEMENT, sr.next()); + assertTokenType(CHARACTERS, sr.next()); + + try { + streamThrough(sr); + fail("Expected an exception for illegal character entity when surrogate pair entities allowation is disabled"); + } catch (XMLStreamException e) { + if (!e.getMessage().startsWith(expectedErrorPhrase)) { + fail(String.format( + "Expected an exception starting from phrase: '%s' when surrogate pair entities allowation is disabled, but the message was: '%s'", + expectedErrorPhrase, + e.getMessage() + )); + } + } finally { + sr.close(); + } + } + /** * This unit test checks that external entities can be resolved; and * to do that without requiring external files, will use a simple