Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support to optionally allow surrogate pair entities (#165) #174

Merged
merged 13 commits into from
Jan 16, 2024
Merged
5 changes: 5 additions & 0 deletions release-notes/CREDITS
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,8 @@ Tim Martin (@Orbisman)

* Contributed fix for #67: Wrong line for XML event location in elements following DTD
(6.6.0)

Kamil Gołębiewski (@Magmaruss)

* Contributed #165: Add support to optionally allow surrogate pair entities
(6.6.0)
2 changes: 2 additions & 0 deletions release-notes/VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Project: woodstox
#67: Wrong line for XML event location in elements following DTD
(reported by @m-g-sonar)
(fix contributed by Tim M)
#165: Add support to optionally allow surrogate pair entities
(contributed by Kamil G)
#176: Fix parser when not replacing entities and treating char references
as entities
(contributed by Guillaume N)
Expand Down
29 changes: 29 additions & 0 deletions src/main/java/com/ctc/wstx/api/ReaderConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ public final class ReaderConfig

final static int PROP_MAX_DTD_DEPTH = 69;

/**
* @since 6.6
*/
final static int PROP_ALLOW_SURROGATE_PAIR_ENTITIES = 70;

/*
////////////////////////////////////////////////
// Limits for numeric properties
Expand Down Expand Up @@ -361,6 +366,8 @@ public final class ReaderConfig
PROP_UNDECLARED_ENTITY_RESOLVER);
sProperties.put(WstxInputProperties.P_BASE_URL,
PROP_BASE_URL);
sProperties.put(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES,
PROP_ALLOW_SURROGATE_PAIR_ENTITIES);
sProperties.put(WstxInputProperties.P_INPUT_PARSING_MODE,
PROP_INPUT_PARSING_MODE);
}
Expand Down Expand Up @@ -419,6 +426,13 @@ public final class ReaderConfig
*/
protected URL mBaseURL;

/**
* Whether to allow surrogate pairs as entities (2 code-points as one target character).
*
* @since 6.6
*/
protected boolean mAllowSurrogatePairEntities = false;

/**
* Parsing mode can be changed from the default xml compliant
* behavior to one of alternate modes (fragment processing,
Expand Down Expand Up @@ -583,6 +597,7 @@ public ReaderConfig createNonShared(SymbolTable sym)
rc.mMaxEntityDepth = mMaxEntityDepth;
rc.mMaxEntityCount = mMaxEntityCount;
rc.mMaxDtdDepth = mMaxDtdDepth;
rc.mAllowSurrogatePairEntities = mAllowSurrogatePairEntities;
if (mSpecialProperties != null) {
int len = mSpecialProperties.length;
Object[] specProps = new Object[len];
Expand Down Expand Up @@ -792,6 +807,10 @@ public XMLResolver getUndeclaredEntityResolver() {

public URL getBaseURL() { return mBaseURL; }

/**
 * Accessor for the surrogate pair entity setting.
 *
 * @return current value of the flag that controls whether surrogate pairs
 *   are allowed as entities
 */
public boolean allowsSurrogatePairEntities() {
    return this.mAllowSurrogatePairEntities;
}

public WstxInputProperties.ParsingMode getInputParsingMode() {
return mParsingMode;
}
Expand Down Expand Up @@ -1074,6 +1093,10 @@ public void setUndeclaredEntityResolver(XMLResolver r) {
}

public void setBaseURL(URL baseURL) { mBaseURL = baseURL; }

/**
 * Mutator for the surrogate pair entity setting.
 *
 * @param state new value of the flag that controls whether surrogate pairs
 *   are allowed as entities
 */
public void doAllowSurrogatePairEntities(boolean state) {
    this.mAllowSurrogatePairEntities = state;
}

public void setInputParsingMode(WstxInputProperties.ParsingMode mode) {
mParsingMode = mode;
Expand Down Expand Up @@ -1533,6 +1556,8 @@ public Object getProperty(int id)
return getUndeclaredEntityResolver();
case PROP_BASE_URL:
return getBaseURL();
case PROP_ALLOW_SURROGATE_PAIR_ENTITIES:
return allowsSurrogatePairEntities();
case PROP_INPUT_PARSING_MODE:
return getInputParsingMode();

Expand Down Expand Up @@ -1757,6 +1782,10 @@ public boolean setProperty(String propName, int id, Object value)
setBaseURL(u);
}
break;

case PROP_ALLOW_SURROGATE_PAIR_ENTITIES:
doAllowSurrogatePairEntities(ArgUtil.convertToBoolean(propName, value));
break;

case PROP_INPUT_PARSING_MODE:
setInputParsingMode((WstxInputProperties.ParsingMode) value);
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/com/ctc/wstx/api/WstxInputProperties.java
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,15 @@ public final class WstxInputProperties
* DTD subset).
*/
public final static String P_BASE_URL = "com.ctc.wstx.baseURL";

/**
* Property of type {@link java.lang.Boolean} that, when enabled, allows parsing
* of high Unicode characters written as surrogate pairs (2 code points).
* Defaults to {@code Boolean.FALSE}, because this is not standard behavior.
*
* @since 6.6
*/
cowtowncoder marked this conversation as resolved.
Show resolved Hide resolved
public final static String P_ALLOW_SURROGATE_PAIR_ENTITIES = "com.ctc.wstx.allowSurrogatePairEntities";

// // // Alternate parsing modes

Expand Down
103 changes: 60 additions & 43 deletions src/main/java/com/ctc/wstx/sr/StreamScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -1183,59 +1183,62 @@ protected int resolveSimpleEntity(boolean checkStd)
char[] buf = mInputBuffer;
int ptr = mInputPtr;
char c = buf[ptr++];
final boolean allowSurrogatePairs = mConfig.allowsSurrogatePairEntities();

// Numeric reference?
if (c == '#') {
c = buf[ptr++];
int value = 0;
int pairValue = 0;
int inputLen = mInputEnd;
if (c == 'x') { // hex
while (ptr < inputLen) {

mInputPtr = ptr;
value = resolveCharEnt(null, false);
ptr = mInputPtr;
c = buf[ptr - 1];

// If resolving entity surrogate pairs enabled and if current entity
// is in range of high surrogate value, try to find surrogate pair
if (allowSurrogatePairs && value >= 0xD800 && value <= 0xDBFF) {
if (c == ';' && ptr + 1 < inputLen) {
c = buf[ptr++];
if (c == ';') {
break;
}
value = value << 4;
if (c <= '9' && c >= '0') {
value += (c - '0');
} else if (c >= 'a' && c <= 'f') {
value += (10 + (c - 'a'));
} else if (c >= 'A' && c <= 'F') {
value += (10 + (c - 'A'));
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
}
/* Need to check for overflow; easiest to do right as
* it happens...
*/
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
}
}
} else { // numeric (decimal)
while (c != ';') {
if (c <= '9' && c >= '0') {
value = (value * 10) + (c - '0');
// Overflow?
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
if (c == '&' && ptr + 1 < inputLen) {
c = buf[ptr++];
if (c == '#' && ptr + 1 < inputLen) {
try {
mInputPtr = ptr;
pairValue = resolveCharEnt(null, false);
ptr = mInputPtr;
c = buf[ptr -1];
} catch (WstxUnexpectedCharException wuce) {
reportNoSurrogatePair(value);
}
} else {
reportNoSurrogatePair(value);
}
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a decimal number.");
reportNoSurrogatePair(value);
}
if (ptr >= inputLen) {
break;
}
c = buf[ptr++];
} else {
reportNoSurrogatePair(value);
}
}

// We get here either if we got it all, OR if we ran out of
// input in current buffer.
if (c == ';') { // got the full thing
mInputPtr = ptr;
validateChar(value);

if (allowSurrogatePairs && pairValue > 0) {
// [woodstox-core#165]
// If pair value is not in range of low surrogate values, then throw an error
if (pairValue < 0xDC00 || pairValue > 0xDFFF) {
reportInvalidSurrogatePair(value, pairValue);
}
value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00);
} else {
validateChar(value);
}

return value;
}

Expand Down Expand Up @@ -1352,7 +1355,7 @@ protected int resolveCharOnlyEntity(boolean checkStd)
// A char reference?
if (c == '#') { // yup
++mInputPtr;
return resolveCharEnt(null);
return resolveCharEnt(null, true);
}

// nope... except may be a pre-def?
Expand Down Expand Up @@ -1518,7 +1521,7 @@ protected int fullyResolveEntity(boolean allowExt)
// Do we have a (numeric) character entity reference?
if (c == '#') { // numeric
final StringBuffer originalSurface = new StringBuffer("#");
int ch = resolveCharEnt(originalSurface);
int ch = resolveCharEnt(originalSurface, true);
if (mCfgTreatCharRefsAsEntities) {
final char[] originalChars = new char[originalSurface.length()];
originalSurface.getChars(0, originalSurface.length(), originalChars, 0);
Expand Down Expand Up @@ -2314,7 +2317,7 @@ protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs,
///////////////////////////////////////////////////////////////////////
*/

private int resolveCharEnt(StringBuffer originalCharacters)
private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar)
throws XMLStreamException
{
int value = 0;
Expand Down Expand Up @@ -2369,7 +2372,9 @@ private int resolveCharEnt(StringBuffer originalCharacters)
}
}
}
validateChar(value);
if (validateChar) {
validateChar(value);
}
return value;
}

Expand Down Expand Up @@ -2455,7 +2460,19 @@ private void reportUnicodeOverflow()
/**
 * Reports a parse error for a character entity whose expansion character
 * is illegal.
 *
 * @param value code of the offending expansion character; included in the
 *   error message in hexadecimal form
 */
private void reportIllegalChar(int value)
    throws XMLStreamException
{
    // Single call only: the message must carry the closing parenthesis after the code.
    throwParseError("Illegal character entity: expansion character (code 0x{0})", Integer.toHexString(value), null);
}

/**
 * Reports a parse error stating that no surrogate pair could be found for
 * the given high surrogate.
 *
 * @param highSurrogate code of the high surrogate character; included in
 *   the error message in hexadecimal form
 */
private void reportNoSurrogatePair(int highSurrogate)
    throws XMLStreamException
{
    final String hexCode = Integer.toHexString(highSurrogate);
    throwParseError("Cannot find surrogate pair: high surrogate character (code 0x{0})", hexCode, null);
}

/**
 * Reports a parse error stating that the two given values do not form a
 * valid surrogate pair.
 *
 * @param firstSurrogate code of the first surrogate character
 * @param secondSurrogate code of the second surrogate character
 */
private void reportInvalidSurrogatePair(int firstSurrogate, int secondSurrogate)
    throws XMLStreamException
{
    // Both codes are rendered in hexadecimal for the message placeholders.
    final String firstHex = Integer.toHexString(firstSurrogate);
    final String secondHex = Integer.toHexString(secondSurrogate);
    throwParseError("Invalid surrogate pair: first surrogate character (code 0x{0}), second surrogate character (code 0x{1})", firstHex, secondHex);
}

protected void verifyLimit(String type, long maxValue, long currentValue)
Expand Down
10 changes: 10 additions & 0 deletions src/test/java/org/codehaus/stax/test/BaseStaxTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import javax.xml.stream.*;
import javax.xml.stream.events.XMLEvent;

import com.ctc.wstx.api.WstxInputProperties;

/* Latest updates:
*
* - 07-Sep-2007, TSa: Updating based on latest understanding of
Expand Down Expand Up @@ -275,6 +277,14 @@ protected static boolean setSupportExternalEntities(XMLInputFactory f, boolean s
return false;
}
}

/**
 * Test helper: sets the surrogate pair entity property on the given factory
 * and asserts that reading the property back returns the same value.
 *
 * @param f input factory to configure
 * @param state value to set for the property
 */
protected static void setResolveEntitySurrogatePairs(XMLInputFactory f, boolean state)
    throws XMLStreamException
{
    final String propName = WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES;
    final Boolean expected = Boolean.valueOf(state);
    f.setProperty(propName, expected);
    // Read the property back to confirm the factory accepted it.
    assertEquals(expected, f.getProperty(propName));
}

protected static void setResolver(XMLInputFactory f, XMLResolver resolver)
throws XMLStreamException
Expand Down
Loading