Skip to content

Commit

Permalink
refact: cleanup Char(Array|Sequence)InputStream
Browse files Browse the repository at this point in the history
  • Loading branch information
sebthom committed Aug 21, 2024
1 parent cc14a8d commit 47e6aba
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
Expand All @@ -26,13 +27,29 @@
abstract class AbstractCharsInputStream extends InputStream {

protected enum EncoderState {
ENCODING,
FLUSHING,
/**
* The {@link #encoder} is actively encoding characters into bytes. This is the
* initial state of the encoder.
*/
ENCODING, //

/**
* The {@link #encoder} has finished processing all characters and is now
* flushing any remaining bytes in its internal buffer.
*/
FLUSHING, //

/**
* The {@link #encoder} has completed both the encoding and flushing processes.
* No more data is left to be read from the encoder.
*/
DONE
}

protected static final char UNICODE_REPLACEMENT_CHAR = '\uFFFD';

/** 1024 surrogate character pairs */
protected static final int DEFAULT_BUFFER_SIZE = 1024;
protected static final int DEFAULT_BUFFER_SIZE = 512;
protected static final int CHAR_BUFFER_MULTIPLIER = 2; // 2 chars for one high/low surrogate character pair
protected static final int BYTE_BUFFER_MULTIPLIER = 4; // 4 bytes for one UTF character (up to 4 bytes)

Expand Down Expand Up @@ -75,6 +92,32 @@ protected AbstractCharsInputStream(final int bufferSize) {
@Override
public abstract int available();

/**
 * Encodes the characters remaining in the given {@link CharBuffer} and leaves
 * the resulting bytes in the {@link #byteBuffer}, ready to be read. Invoked by
 * {@link #refillByteBuffer()} implementations.
 *
 * <p>
 * The {@link #byteBuffer} is cleared before encoding and flipped afterwards, so
 * any bytes it still contained are discarded. Pass {@code isEndOfInput=false}
 * while more characters may follow, and {@code isEndOfInput=true} for the final
 * call so the encoder can finish its last encoding steps.
 * </p>
 *
 * <p>
 * Only error results are reported; an overflow or underflow result is not
 * signalled to the caller.
 * </p>
 *
 * @param in
 *           the {@link CharBuffer} holding the characters to encode.
 * @param isEndOfInput
 *           {@code true} if no further input will be supplied to the encoder,
 *           {@code false} otherwise.
 * @throws CharacterCodingException
 *            if the encoder reports an error result for the given input.
 */
protected void encodeChars(final CharBuffer in, final boolean isEndOfInput) throws CharacterCodingException {
    // start writing at the beginning of the byte buffer
    byteBuffer.clear();
    final CoderResult outcome = encoder.encode(in, byteBuffer, isEndOfInput);
    // switch the byte buffer to read mode before anything can be thrown
    byteBuffer.flip();

    if (!outcome.isError())
        return;

    outcome.throwException();
}

protected boolean flushEncoder() throws IOException {
if (encoderState == EncoderState.DONE)
return false;
Expand All @@ -88,8 +131,12 @@ protected boolean flushEncoder() throws IOException {
final CoderResult result = encoder.flush(byteBuffer);
byteBuffer.flip();

if (result.isOverflow()) // byteBuffer too small
if (result.isOverflow()) {
// the byteBuffer has been filled, but there are more bytes to be flushed.
// after reading all available bytes from byteBuffer, flushEncoder() needs to
// be called again to process the remaining data.
return true;
}

if (result.isError()) {
result.throwException();
Expand All @@ -116,7 +163,7 @@ public boolean markSupported() {

@Override
public int read() throws IOException {
if (!byteBuffer.hasRemaining() && !refillBuffer())
if (!byteBuffer.hasRemaining() && !refillByteBuffer())
return IOUtils.EOF;
return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
}
Expand All @@ -132,7 +179,7 @@ public int read(final byte[] buf, final int off, final int bytesToRead) throws I

while (bytesRead < bytesToRead) {
if (bytesReadable == 0) {
if (refillBuffer()) {
if (refillByteBuffer()) {
bytesReadable = byteBuffer.remaining();
} else
return bytesRead == 0 ? IOUtils.EOF : bytesRead;
Expand All @@ -147,7 +194,16 @@ public int read(final byte[] buf, final int off, final int bytesToRead) throws I
return bytesRead;
}

protected abstract boolean refillBuffer() throws IOException;
/**
 * Refills the {@link #byteBuffer} by reading characters from the character
 * supplier, encoding them, and storing the resulting bytes into the
 * {@link #byteBuffer}.
 *
 * <p>
 * Called by the {@code read} methods whenever the {@link #byteBuffer} has no
 * remaining bytes.
 * </p>
 *
 * @return {@code true} if the buffer was successfully refilled and has bytes
 *         available for reading, {@code false} if the end of the stream is
 *         reached and there are no more bytes to read.
 * @throws IOException
 *            if the characters cannot be obtained or encoded.
 */
protected abstract boolean refillByteBuffer() throws IOException;

@Override
public synchronized void reset() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import java.io.IOException;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CoderResult;

/**
* @author <a href="https://sebthom.de/">Sebastian Thomschke</a>
Expand Down Expand Up @@ -46,7 +45,7 @@ public int available() {
}

@Override
protected boolean refillBuffer() throws IOException {
protected boolean refillByteBuffer() throws IOException {
if (encoderState == EncoderState.DONE)
return false;

Expand All @@ -58,12 +57,7 @@ protected boolean refillBuffer() throws IOException {
// if EOF is reached transition to flushing
if (charIndex >= charsLen) {
// finalize encoding before switching to flushing
byteBuffer.clear();
final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
byteBuffer.flip();
if (result.isError()) {
result.throwException();
}
encodeChars(CharBuffer.allocate(0), true /* signal EOF */);
return flushEncoder();
}

Expand All @@ -80,11 +74,11 @@ protected boolean refillBuffer() throws IOException {
charBuffer.put(lowSurrogate);
} else {
// missing low surrogate - fallback to replacement character
charBuffer.put('\uFFFD');
charBuffer.put(UNICODE_REPLACEMENT_CHAR);
}
} else {
// missing low surrogate - fallback to replacement character
charBuffer.put('\uFFFD');
charBuffer.put(UNICODE_REPLACEMENT_CHAR);
break;
}
} else {
Expand All @@ -94,12 +88,7 @@ protected boolean refillBuffer() throws IOException {
charBuffer.flip();

// encode chars into bytes
byteBuffer.clear();
final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
byteBuffer.flip();
if (result.isError()) {
result.throwException();
}
encodeChars(charBuffer, false);
} catch (final RuntimeException ex) {
throw new IOException(ex);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import java.io.IOException;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CoderResult;
import java.util.List;
import java.util.function.IntSupplier;

Expand All @@ -18,6 +17,10 @@
*/
public class CharSequenceInputStream extends AbstractCharsInputStream {

/**
* Functional interface for supplying characters at a specified index.
* Implementations can define how characters are fetched.
*/
@FunctionalInterface
public interface CharsSupplier {
char charAt(int index) throws Exception;
Expand Down Expand Up @@ -133,7 +136,7 @@ public int available() {
}

@Override
protected boolean refillBuffer() throws IOException {
protected boolean refillByteBuffer() throws IOException {
if (encoderState == EncoderState.DONE)
return false;

Expand All @@ -145,12 +148,7 @@ protected boolean refillBuffer() throws IOException {
// if EOF is reached transition to flushing
if (charIndex >= charsLen) {
// finalize encoding before switching to flushing
byteBuffer.clear();
final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
byteBuffer.flip();
if (result.isError()) {
result.throwException();
}
encodeChars(CharBuffer.allocate(0), true /* signal EOF */);
return flushEncoder();
}

Expand All @@ -167,11 +165,11 @@ protected boolean refillBuffer() throws IOException {
charBuffer.put(lowSurrogate);
} else {
// missing low surrogate - fallback to replacement character
charBuffer.put('\uFFFD');
charBuffer.put(UNICODE_REPLACEMENT_CHAR);
}
} else {
// missing low surrogate - fallback to replacement character
charBuffer.put('\uFFFD');
charBuffer.put(UNICODE_REPLACEMENT_CHAR);
break;
}
} else {
Expand All @@ -181,12 +179,7 @@ protected boolean refillBuffer() throws IOException {
charBuffer.flip();

// encode chars into bytes
byteBuffer.clear();
final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
byteBuffer.flip();
if (result.isError()) {
result.throwException();
}
encodeChars(charBuffer, false);
} catch (final Exception ex) {
throw new IOException(ex);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public class CharArrayInputStreamTest {
public void testAvailable() throws IOException {
try (var is = new CharArrayInputStream(TEST_ASCII.toCharArray())) {
assertThat(is.available()).isEqualTo(TEST_ASCII.length());
final byte[] buffer = new byte[4];
final var buffer = new byte[4];
is.read(buffer);
assertThat(is.available()).isEqualTo(TEST_ASCII.length() - 4);
is.readAllBytes();
Expand Down Expand Up @@ -87,7 +87,7 @@ public void testReadEachByte() throws IOException {
bytesRead.add((byte) b);
}

final byte[] byteArray = new byte[bytesRead.size()];
final var byteArray = new byte[bytesRead.size()];
for (int i = 0; i < bytesRead.size(); i++) {
byteArray[i] = bytesRead.get(i);
}
Expand All @@ -97,7 +97,7 @@ public void testReadEachByte() throws IOException {

@Test
public void testReadIntoByteArray() throws IOException {
final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text
final var buffer = new byte[1024]; // Buffer to read a portion of the text

try (var is = new CharArrayInputStream(TEST_UNICODE.toCharArray())) {
final int bytesRead = is.read(buffer, 0, buffer.length);
Expand All @@ -110,7 +110,7 @@ public void testReadIntoByteArray() throws IOException {
@Test
public void testResetWithoutMark() throws IOException {
try (var is = new CharArrayInputStream(TEST_UNICODE.toCharArray())) {
final byte[] buffer = new byte[EMOJI_BYTES_LEN];
final var buffer = new byte[EMOJI_BYTES_LEN];

// read the first few bytes (the emoji)
assertThat(is.read(buffer)).isEqualTo(EMOJI_BYTES_LEN);
Expand All @@ -130,7 +130,7 @@ public void testSkip() throws IOException {
final long skipped = is.skip(EMOJI_BYTES_LEN);
assertThat(skipped).isEqualTo(EMOJI_BYTES_LEN);

final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN];
final var japanese = new byte[TEST_UNICODE_BYTES_LEN];
final int bytesRead = is.read(japanese);

assertThat(new String(japanese, 0, bytesRead, UTF_8)).isEqualTo(JAPANESE);
Expand All @@ -142,7 +142,7 @@ public void testHighSurrogateAtEndOfInput() throws IOException {
final char[] invalidSequence = {'A', '\uD800'}; // valid char followed by an isolated high surrogate
try (var is = new CharArrayInputStream(invalidSequence, UTF_8)) {
final byte[] result = is.readAllBytes();
final String output = new String(result, UTF_8);
final var output = new String(result, UTF_8);

// the high surrogate at the end should be replaced by the Unicode replacement char
assertThat(output).isEqualTo("A" + "\uFFFD");
Expand All @@ -154,7 +154,7 @@ public void testHighSurrogateWithoutLowSurrogate() throws IOException {
final char[] invalidSequence = {'\uD800', 'A'}; // \uD800 is a high surrogate, followed by 'A'
try (var is = new CharArrayInputStream(invalidSequence, UTF_8)) {
final byte[] result = is.readAllBytes();
final String output = new String(result, UTF_8);
final var output = new String(result, UTF_8);

// the invalid surrogate pair should be replaced by the Unicode replacement char
assertThat(output).isEqualTo("\uFFFD" + "A");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public class CharSequenceInputStreamTest {
public void testAvailable() throws IOException {
try (var is = new CharSequenceInputStream(TEST_ASCII)) {
assertThat(is.available()).isEqualTo(TEST_ASCII.length());
final byte[] buffer = new byte[4];
final var buffer = new byte[4];
is.read(buffer);
assertThat(is.available()).isEqualTo(TEST_ASCII.length() - 4);
is.readAllBytes();
Expand Down Expand Up @@ -87,7 +87,7 @@ public void testReadEachByte() throws IOException {
bytesRead.add((byte) b);
}

final byte[] byteArray = new byte[bytesRead.size()];
final var byteArray = new byte[bytesRead.size()];
for (int i = 0; i < bytesRead.size(); i++) {
byteArray[i] = bytesRead.get(i);
}
Expand All @@ -97,7 +97,7 @@ public void testReadEachByte() throws IOException {

@Test
public void testReadIntoByteArray() throws IOException {
final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text
final var buffer = new byte[1024]; // Buffer to read a portion of the text

try (var is = new CharSequenceInputStream(TEST_UNICODE)) {
final int bytesRead = is.read(buffer, 0, buffer.length);
Expand All @@ -110,7 +110,7 @@ public void testReadIntoByteArray() throws IOException {
@Test
public void testResetWithoutMark() throws IOException {
try (var is = new CharSequenceInputStream(TEST_UNICODE)) {
final byte[] buffer = new byte[EMOJI_BYTES_LEN];
final var buffer = new byte[EMOJI_BYTES_LEN];

// read the first few bytes (the emoji)
assertThat(is.read(buffer)).isEqualTo(EMOJI_BYTES_LEN);
Expand All @@ -130,7 +130,7 @@ public void testSkip() throws IOException {
final long skipped = is.skip(EMOJI_BYTES_LEN);
assertThat(skipped).isEqualTo(EMOJI_BYTES_LEN);

final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN];
final var japanese = new byte[TEST_UNICODE_BYTES_LEN];
final int bytesRead = is.read(japanese);

assertThat(new String(japanese, 0, bytesRead, UTF_8)).isEqualTo(JAPANESE);
Expand All @@ -142,7 +142,7 @@ public void testHighSurrogateAtEndOfInput() throws IOException {
final char[] invalidSequence = {'A', '\uD800'}; // valid char followed by an isolated high surrogate
try (var is = new CharSequenceInputStream(new String(invalidSequence), UTF_8)) {
final byte[] result = is.readAllBytes();
final String output = new String(result, UTF_8);
final var output = new String(result, UTF_8);

// the high surrogate at the end should be replaced by the Unicode replacement char
assertThat(output).isEqualTo("A" + "\uFFFD");
Expand All @@ -154,7 +154,7 @@ public void testHighSurrogateWithoutLowSurrogate() throws IOException {
final char[] invalidSequence = {'\uD800', 'A'}; // \uD800 is a high surrogate, followed by 'A'
try (var is = new CharSequenceInputStream(new String(invalidSequence), UTF_8)) {
final byte[] result = is.readAllBytes();
final String output = new String(result, UTF_8);
final var output = new String(result, UTF_8);

// the invalid surrogate pair should be replaced by the Unicode replacement char
assertThat(output).isEqualTo("\uFFFD" + "A");
Expand Down

0 comments on commit 47e6aba

Please sign in to comment.