refact: cleanup Char(Array|Sequence)InputStream

sebthom · Aug 21, 2024 · 47e6aba · 47e6aba
1 parent cc14a8d
commit 47e6aba
Show file tree

Hide file tree

Showing 5 changed files with 91 additions and 53 deletions.
diff --git a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/AbstractCharsInputStream.java b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/AbstractCharsInputStream.java
@@ -10,6 +10,7 @@
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;
@@ -26,13 +27,29 @@
 abstract class AbstractCharsInputStream extends InputStream {
 
    protected enum EncoderState {
-      ENCODING,
-      FLUSHING,
+      /**
+       * The {@link #encoder} is actively encoding characters into bytes. This is the
+       * initial state of the encoder.
+       */
+      ENCODING, //
+
+      /**
+       * The {@link #encoder} has finished processing all characters and is now
+       * flushing any remaining bytes in its internal buffer.
+       */
+      FLUSHING, //
+
+      /**
+       * The {@link #encoder} has completed both the encoding and flushing processes.
+       * No more data is left to be read from the encoder.
+       */
       DONE
    }
 
+   protected static final char UNICODE_REPLACEMENT_CHAR = '\uFFFD';
+
-   /** 1024 surrogate character pairs */
+   /** 512 surrogate character pairs */
-   protected static final int DEFAULT_BUFFER_SIZE = 1024;
+   protected static final int DEFAULT_BUFFER_SIZE = 512;
    protected static final int CHAR_BUFFER_MULTIPLIER = 2; // 2 chars for one high/low surrogate character pair
    protected static final int BYTE_BUFFER_MULTIPLIER = 4; // 4 bytes for one UTF character (up to 4 bytes)
 
@@ -75,6 +92,32 @@ protected AbstractCharsInputStream(final int bufferSize) {
    @Override
    public abstract int available();
 
+   /**
+    * Encodes characters from the given {@link CharBuffer} into bytes and stores
+    * them in the {@link #byteBuffer}; called by {@link #refillByteBuffer()}.
+    *
+    * <p>
+    * The method can be used either to encode characters in the middle of input
+    * (with {@code isEndOfInput=false}) or to finalize the encoding process at the
+    * end of input (with {@code isEndOfInput=true}).
+    * </p>
+    *
+    * @param in
+    *           the {@link CharBuffer} containing characters to encode.
+    * @param isEndOfInput
+    *           if {@code true}, signals that no more input will be provided.
+    * @throws CharacterCodingException
+    *           if the encoder reports an error result for the given input.
+    */
+   protected void encodeChars(final CharBuffer in, final boolean isEndOfInput) throws CharacterCodingException {
+      byteBuffer.clear(); // reset position/limit so encoded bytes overwrite the previous content
+      final CoderResult result = encoder.encode(in, byteBuffer, isEndOfInput);
+      byteBuffer.flip(); // switch to read mode: the encoded bytes become the readable range
+      if (result.isError()) {
+         result.throwException();
+      }
+   }
+
    protected boolean flushEncoder() throws IOException {
       if (encoderState == EncoderState.DONE)
          return false;
@@ -88,8 +131,12 @@ protected boolean flushEncoder() throws IOException {
       final CoderResult result = encoder.flush(byteBuffer);
       byteBuffer.flip();
 
-      if (result.isOverflow()) // byteBuffer too small
+      if (result.isOverflow()) {
+         // the byteBuffer has been filled, but there are more bytes to be flushed.
+         // after reading all available bytes from byteBuffer, flushEncoder() needs to
+         // be called again to process the remaining data.
          return true;
+      }
 
       if (result.isError()) {
          result.throwException();
@@ -116,7 +163,7 @@ public boolean markSupported() {
 
    @Override
    public int read() throws IOException {
-      if (!byteBuffer.hasRemaining() && !refillBuffer())
+      if (!byteBuffer.hasRemaining() && !refillByteBuffer())
          return IOUtils.EOF;
       return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
    }
@@ -132,7 +179,7 @@ public int read(final byte[] buf, final int off, final int bytesToRead) throws I
 
       while (bytesRead < bytesToRead) {
          if (bytesReadable == 0) {
-            if (refillBuffer()) {
+            if (refillByteBuffer()) {
                bytesReadable = byteBuffer.remaining();
             } else
                return bytesRead == 0 ? IOUtils.EOF : bytesRead;
@@ -147,7 +194,16 @@ public int read(final byte[] buf, final int off, final int bytesToRead) throws I
       return bytesRead;
    }
 
-   protected abstract boolean refillBuffer() throws IOException;
+   /**
+    * Refills the {@link #byteBuffer} by reading characters from the character
+    * supplier, encoding them, and storing the resulting bytes into the
+    * {@link #byteBuffer}.
+    *
+    * @return {@code true} if the buffer was successfully refilled and has bytes
+    *         available for reading, {@code false} if the end of the stream is
+    *         reached and there are no more bytes to read.
+    */
+   protected abstract boolean refillByteBuffer() throws IOException;
 
    @Override
    public synchronized void reset() throws IOException {

diff --git a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharArrayInputStream.java b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharArrayInputStream.java
@@ -7,7 +7,6 @@
 import java.io.IOException;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
-import java.nio.charset.CoderResult;
 
 /**
  * @author <a href="https://sebthom.de/">Sebastian Thomschke</a>
@@ -46,7 +45,7 @@ public int available() {
    }
 
    @Override
-   protected boolean refillBuffer() throws IOException {
+   protected boolean refillByteBuffer() throws IOException {
       if (encoderState == EncoderState.DONE)
          return false;
 
@@ -58,12 +57,7 @@ protected boolean refillBuffer() throws IOException {
       // if EOF is reached transition to flushing
       if (charIndex >= charsLen) {
          // finalize encoding before switching to flushing
-         byteBuffer.clear();
-         final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
-         byteBuffer.flip();
-         if (result.isError()) {
-            result.throwException();
-         }
+         encodeChars(CharBuffer.allocate(0), true /* signal EOF */);
          return flushEncoder();
       }
 
@@ -80,11 +74,11 @@ protected boolean refillBuffer() throws IOException {
                      charBuffer.put(lowSurrogate);
                   } else {
                      // missing low surrogate - fallback to replacement character
-                     charBuffer.put('\uFFFD');
+                     charBuffer.put(UNICODE_REPLACEMENT_CHAR);
                   }
                } else {
                   // missing low surrogate - fallback to replacement character
-                  charBuffer.put('\uFFFD');
+                  charBuffer.put(UNICODE_REPLACEMENT_CHAR);
                   break;
                }
             } else {
@@ -94,12 +88,7 @@ protected boolean refillBuffer() throws IOException {
          charBuffer.flip();
 
          // encode chars into bytes
-         byteBuffer.clear();
-         final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
-         byteBuffer.flip();
-         if (result.isError()) {
-            result.throwException();
-         }
+         encodeChars(charBuffer, false);
       } catch (final RuntimeException ex) {
          throw new IOException(ex);
       }

diff --git a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharSequenceInputStream.java b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharSequenceInputStream.java
@@ -7,7 +7,6 @@
 import java.io.IOException;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
-import java.nio.charset.CoderResult;
 import java.util.List;
 import java.util.function.IntSupplier;
 
@@ -18,6 +17,10 @@
  */
 public class CharSequenceInputStream extends AbstractCharsInputStream {
 
+   /**
+    * Functional interface for supplying characters at a specified index.
+    * Implementations can define how characters are fetched.
+    */
    @FunctionalInterface
    public interface CharsSupplier {
       char charAt(int index) throws Exception;
@@ -133,7 +136,7 @@ public int available() {
    }
 
    @Override
-   protected boolean refillBuffer() throws IOException {
+   protected boolean refillByteBuffer() throws IOException {
       if (encoderState == EncoderState.DONE)
          return false;
 
@@ -145,12 +148,7 @@ protected boolean refillBuffer() throws IOException {
       // if EOF is reached transition to flushing
       if (charIndex >= charsLen) {
          // finalize encoding before switching to flushing
-         byteBuffer.clear();
-         final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
-         byteBuffer.flip();
-         if (result.isError()) {
-            result.throwException();
-         }
+         encodeChars(CharBuffer.allocate(0), true /* signal EOF */);
          return flushEncoder();
       }
 
@@ -167,11 +165,11 @@ protected boolean refillBuffer() throws IOException {
                      charBuffer.put(lowSurrogate);
                   } else {
                      // missing low surrogate - fallback to replacement character
-                     charBuffer.put('\uFFFD');
+                     charBuffer.put(UNICODE_REPLACEMENT_CHAR);
                   }
                } else {
                   // missing low surrogate - fallback to replacement character
-                  charBuffer.put('\uFFFD');
+                  charBuffer.put(UNICODE_REPLACEMENT_CHAR);
                   break;
                }
             } else {
@@ -181,12 +179,7 @@ protected boolean refillBuffer() throws IOException {
          charBuffer.flip();
 
          // encode chars into bytes
-         byteBuffer.clear();
-         final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
-         byteBuffer.flip();
-         if (result.isError()) {
-            result.throwException();
-         }
+         encodeChars(charBuffer, false);
       } catch (final Exception ex) {
          throw new IOException(ex);
       }

diff --git a/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharArrayInputStreamTest.java b/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharArrayInputStreamTest.java
@@ -29,7 +29,7 @@ public class CharArrayInputStreamTest {
    public void testAvailable() throws IOException {
       try (var is = new CharArrayInputStream(TEST_ASCII.toCharArray())) {
          assertThat(is.available()).isEqualTo(TEST_ASCII.length());
-         final byte[] buffer = new byte[4];
+         final var buffer = new byte[4];
          is.read(buffer);
          assertThat(is.available()).isEqualTo(TEST_ASCII.length() - 4);
          is.readAllBytes();
@@ -87,7 +87,7 @@ public void testReadEachByte() throws IOException {
             bytesRead.add((byte) b);
          }
 
-         final byte[] byteArray = new byte[bytesRead.size()];
+         final var byteArray = new byte[bytesRead.size()];
          for (int i = 0; i < bytesRead.size(); i++) {
             byteArray[i] = bytesRead.get(i);
          }
@@ -97,7 +97,7 @@ public void testReadEachByte() throws IOException {
 
    @Test
    public void testReadIntoByteArray() throws IOException {
-      final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text
+      final var buffer = new byte[1024]; // Buffer to read a portion of the text
 
       try (var is = new CharArrayInputStream(TEST_UNICODE.toCharArray())) {
          final int bytesRead = is.read(buffer, 0, buffer.length);
@@ -110,7 +110,7 @@ public void testReadIntoByteArray() throws IOException {
    @Test
    public void testResetWithoutMark() throws IOException {
       try (var is = new CharArrayInputStream(TEST_UNICODE.toCharArray())) {
-         final byte[] buffer = new byte[EMOJI_BYTES_LEN];
+         final var buffer = new byte[EMOJI_BYTES_LEN];
 
          // read the first few bytes (the emoji)
          assertThat(is.read(buffer)).isEqualTo(EMOJI_BYTES_LEN);
@@ -130,7 +130,7 @@ public void testSkip() throws IOException {
          final long skipped = is.skip(EMOJI_BYTES_LEN);
          assertThat(skipped).isEqualTo(EMOJI_BYTES_LEN);
 
-         final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN];
+         final var japanese = new byte[TEST_UNICODE_BYTES_LEN];
          final int bytesRead = is.read(japanese);
 
          assertThat(new String(japanese, 0, bytesRead, UTF_8)).isEqualTo(JAPANESE);
@@ -142,7 +142,7 @@ public void testHighSurrogateAtEndOfInput() throws IOException {
       final char[] invalidSequence = {'A', '\uD800'}; // valid char followed by an isolated high surrogate
       try (var is = new CharArrayInputStream(invalidSequence, UTF_8)) {
          final byte[] result = is.readAllBytes();
-         final String output = new String(result, UTF_8);
+         final var output = new String(result, UTF_8);
 
          // the high surrogate at the end should be replaced by the Unicode replacement char
          assertThat(output).isEqualTo("A" + "\uFFFD");
@@ -154,7 +154,7 @@ public void testHighSurrogateWithoutLowSurrogate() throws IOException {
       final char[] invalidSequence = {'\uD800', 'A'}; // \uD800 is a high surrogate, followed by 'A'
       try (var is = new CharArrayInputStream(invalidSequence, UTF_8)) {
          final byte[] result = is.readAllBytes();
-         final String output = new String(result, UTF_8);
+         final var output = new String(result, UTF_8);
 
          // the invalid surrogate pair should be replaced by the Unicode replacement char
          assertThat(output).isEqualTo("\uFFFD" + "A");

diff --git a/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharSequenceInputStreamTest.java b/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharSequenceInputStreamTest.java
@@ -29,7 +29,7 @@ public class CharSequenceInputStreamTest {
    public void testAvailable() throws IOException {
       try (var is = new CharSequenceInputStream(TEST_ASCII)) {
          assertThat(is.available()).isEqualTo(TEST_ASCII.length());
-         final byte[] buffer = new byte[4];
+         final var buffer = new byte[4];
          is.read(buffer);
          assertThat(is.available()).isEqualTo(TEST_ASCII.length() - 4);
          is.readAllBytes();
@@ -87,7 +87,7 @@ public void testReadEachByte() throws IOException {
             bytesRead.add((byte) b);
          }
 
-         final byte[] byteArray = new byte[bytesRead.size()];
+         final var byteArray = new byte[bytesRead.size()];
          for (int i = 0; i < bytesRead.size(); i++) {
             byteArray[i] = bytesRead.get(i);
          }
@@ -97,7 +97,7 @@ public void testReadEachByte() throws IOException {
 
    @Test
    public void testReadIntoByteArray() throws IOException {
-      final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text
+      final var buffer = new byte[1024]; // Buffer to read a portion of the text
 
       try (var is = new CharSequenceInputStream(TEST_UNICODE)) {
          final int bytesRead = is.read(buffer, 0, buffer.length);
@@ -110,7 +110,7 @@ public void testReadIntoByteArray() throws IOException {
    @Test
    public void testResetWithoutMark() throws IOException {
       try (var is = new CharSequenceInputStream(TEST_UNICODE)) {
-         final byte[] buffer = new byte[EMOJI_BYTES_LEN];
+         final var buffer = new byte[EMOJI_BYTES_LEN];
 
          // read the first few bytes (the emoji)
          assertThat(is.read(buffer)).isEqualTo(EMOJI_BYTES_LEN);
@@ -130,7 +130,7 @@ public void testSkip() throws IOException {
          final long skipped = is.skip(EMOJI_BYTES_LEN);
          assertThat(skipped).isEqualTo(EMOJI_BYTES_LEN);
 
-         final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN];
+         final var japanese = new byte[TEST_UNICODE_BYTES_LEN];
          final int bytesRead = is.read(japanese);
 
          assertThat(new String(japanese, 0, bytesRead, UTF_8)).isEqualTo(JAPANESE);
@@ -142,7 +142,7 @@ public void testHighSurrogateAtEndOfInput() throws IOException {
       final char[] invalidSequence = {'A', '\uD800'}; // valid char followed by an isolated high surrogate
       try (var is = new CharSequenceInputStream(new String(invalidSequence), UTF_8)) {
          final byte[] result = is.readAllBytes();
-         final String output = new String(result, UTF_8);
+         final var output = new String(result, UTF_8);
 
          // the high surrogate at the end should be replaced by the Unicode replacement char
          assertThat(output).isEqualTo("A" + "\uFFFD");
@@ -154,7 +154,7 @@ public void testHighSurrogateWithoutLowSurrogate() throws IOException {
       final char[] invalidSequence = {'\uD800', 'A'}; // \uD800 is a high surrogate, followed by 'A'
       try (var is = new CharSequenceInputStream(new String(invalidSequence), UTF_8)) {
          final byte[] result = is.readAllBytes();
-         final String output = new String(result, UTF_8);
+         final var output = new String(result, UTF_8);
 
          // the invalid surrogate pair should be replaced by the Unicode replacement char
          assertThat(output).isEqualTo("\uFFFD" + "A");