just documentation/parameter names

apache · Feb 2, 2024 · 162e1f0 · 162e1f0
1 parent d83b90e
commit 162e1f0
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 17 deletions.
diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java
@@ -47,8 +47,10 @@
 import org.xml.sax.SAXException;
 
 /**
- * This abstract class is a base class for other stream 
- * serializers (xml, html, text ...) that write output to a stream.
+ * This abstract class is a base class for other stream serializers
+ * (xml, html, text ...) that write output to a stream.  Note that
+ * this is stateful, NOT designed to be multithreaded; each thread and
+ * each output stream should have its own instance.
  * 
  * @xsl.usage internal
  */
@@ -61,7 +63,6 @@ abstract public class ToStream extends SerializerBase
     /** Stack to keep track of disabling output escaping. */
     protected BoolStack m_disableOutputEscapingStates = new BoolStack();
 
-
     /**
      * The encoding information associated with this serializer.
      * Although initially there is no encoding,
@@ -174,7 +175,40 @@ abstract public class ToStream extends SerializerBase
      * which is existing older behavior.
      */
     private boolean m_expandDTDEntities = true;
-
+
+    /**
+     * Traditionally, we handled Surrogate Character Pairs by looking
+     * ahead in the input buffer. This could fail if, eg, the pair crossed
+     * between one call to characters() and the next, which can happen
+     * since SAX providers are free to manage buffering as they see fit
+     * and what the XML Data Model considers a single block of text
+     * may be delivered in multiple calls.
+     *
+     * The more robust solution is to maintain state, setting the High
+     * UTF16 Surrogate character aside and processing it when the Low
+     * Surrogate arrives.
+     *
+     * However, handling this robustly requires recognizing, and
+     * handling, cases where a Surrogate appears but is not adjacent to
+     * the other half of the pair. That's illegal UTF16, but as utility
+     * code we can't guarantee some caller won't attempt it.
+     *
+     * Historically, we have handled this one of two ways, either
+     * generating an IOException with ER_INVALID_UTF16_SURROGATE or
+     * outputting the bad surrogate as a Numeric Character Reference
+     * (and possibly issuing a message to stderr, as in ToTextStream).
+     * The inconsistency annoys me a bit.  Only SGML-based formats
+     * support NCRs, and XML explicitly says that even an NCR may not
+     * represent an isolated surrogate.  Hence, for correctness, we AT
+     * LEAST want the stderr message, and arguably should be throwing
+     * the exception.  However, if we change any of this behavior we
+     * want to be able to revert to the prior response, in case some
+     * user is actually expecting to see that.
+     *
+     * Note that since we process char arrays, the "pending high surrogate"
+     * buffer is a char, with 0 used to indicate "empty buffer".
+     */
+    private char m_pendingUTF16HighSurrogate = 0;
 
     /**
      * Default constructor
@@ -959,67 +993,69 @@ protected boolean escapingNotNeeded(char ch)
     /**
      * Once a surrogate has been detected, write out the pair of
      * characters if it is in the encoding, or if there is no
-     * encoding, otherwise write out an entity reference
+     * encoding, otherwise write out a numeric character reference
      * of the value of the unicode code point of the character
      * represented by the high/low surrogate pair.
      * <p>
      * An exception is thrown if there is no low surrogate in the pair,
      * because the array ends unexpectedly, or if the low char is there
      * but its value is such that it is not a low surrogate.
      *
-     * @param c the first (high) part of the surrogate, which
+     * @param high the first (high) part of the surrogate, which
      * must be confirmed before calling this method.
      * @param ch Character array.
      * @param i position Where the surrogate was detected.
      * @param end The end index of the significant characters.
      * @return 0 if the pair of characters was written out as-is,
      * the unicode code point of the character represented by
-     * the surrogate pair if an entity reference with that value
+     * the surrogate pair if a numeric char ref with that value
      * was written out. 
      * 
      * @throws IOException if  invalid UTF-16 surrogate detected.
      */
-    protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
+    protected int writeUTF16Surrogate(final char high, char ch[], int i, int end)
         throws IOException
     {
-        int codePoint = 0;
+	// THROWS if surrogate pair crosses input buffers
+	// Should probably handle this better.
         if (i + 1 >= end)
         {
             throw new IOException(
                 Utils.messages.createMessage(
                     MsgKey.ER_INVALID_UTF16_SURROGATE,
-                    new Object[] { Integer.toHexString((int) c)}));
+                    new Object[] { Integer.toHexString((int) high)}));
         }
 
-        final char high = c;
         final char low = ch[i+1];
         if (!Encodings.isLowUTF16Surrogate(low)) {
             throw new IOException(
                 Utils.messages.createMessage(
                     MsgKey.ER_INVALID_UTF16_SURROGATE,
                     new Object[] {
-                        Integer.toHexString((int) c)
+                        Integer.toHexString((int) high)
                             + " "
                             + Integer.toHexString(low)}));
         }
 
         final java.io.Writer writer = m_writer;
+        int codePoint = 0; // Nonzero iff written as NCR
 
         // If we make it to here we have a valid high, low surrogate pair
-        if (m_encodingInfo.isInEncoding(c,low)) {
+        if (m_encodingInfo.isInEncoding(high,low)) {
             // If the character formed by the surrogate pair
             // is in the encoding, so just write it out
+	    // NOTE: Assumes same buffer
             writer.write(ch,i,2);
         }
         else {
             // Don't know what to do with this char, it is
             // not in the encoding and not a high char in
-            // a surrogate pair, so write out as an entity ref
+            // a surrogate pair, so write out as a numeric char ref
             final String encoding = getEncoding();
             if (encoding != null) {
                 /* The output encoding is known, 
                  * so something is wrong.
-                  */
+                 */
                 codePoint = Encodings.toCodePoint(high, low);
                 // not in the encoding, so write out a character reference
                 writer.write('&');
@@ -1033,7 +1069,10 @@ protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
                 writer.write(ch, i, 2);
             }
         }
-        // non-zero only if character reference was written out.
+
+	// ToTextStream tests this and issues an error message (but
+	// not exception) if the not-in-encoding case arises,
+	// outputting an NCR in passing. 
         return codePoint;
     }
 

diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToTextStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToTextStream.java
@@ -291,6 +291,7 @@ void writeNormalizedChars(
                 if (codePoint != 0) {
                     // I think we can just emit the message,
                     // not crash and burn.
+		    // Git commit ffb244aaa0f88368a0bf483bddc7e74d8a4d83bf?
                     final String integralValue = Integer.toString(codePoint);
                     final String msg = Utils.messages.createMessage(
                         MsgKey.ER_ILLEGAL_CHARACTER,
@@ -306,7 +307,7 @@ void writeNormalizedChars(
             } else {
                 // Don't know what to do with this char, it is
                 // not in the encoding and not a high char in
-                // a surrogate pair, so write out as an entity ref
+                // a surrogate pair, so write out as numeric char ref
                 if (encoding != null) {
                     /* The output encoding is known, 
                      * so something is wrong.