Skip to content

Commit

Permalink
wip on #1614
Browse files Browse the repository at this point in the history
This is more complicated than originally described because the file encoding isn't specified anywhere and Java bytes are signed, so they include negative values.
EOF is detected correctly, but it is also falsely triggered by any character whose value becomes negative when it is truncated to a byte.
It's unclear whether this is meant to read only ASCII, ISO 8859-1, or UTF-8, but nothing outside of the ASCII range works correctly.
  • Loading branch information
lbergelson committed Dec 2, 2022
1 parent f684576 commit e5ba94c
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 16 deletions.
11 changes: 6 additions & 5 deletions src/main/java/htsjdk/tribble/index/AbstractIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
Expand Down Expand Up @@ -302,10 +303,10 @@ private void writeHeader(final LittleEndianOutputStream dos) throws IOException
private void readHeader(final LittleEndianInputStream dis) throws IOException {

version = dis.readInt();
indexedPath = IOUtil.getPath(dis.readString());
indexedPath = IOUtil.getPath(dis.readString(StandardCharsets.US_ASCII));
indexedFileSize = dis.readLong();
indexedFileTS = dis.readLong();
indexedFileMD5 = dis.readString();
indexedFileMD5 = dis.readString(StandardCharsets.US_ASCII);
flags = dis.readInt();
if (version < 3 && (flags & SEQUENCE_DICTIONARY_FLAG) == SEQUENCE_DICTIONARY_FLAG) {
readSequenceDictionary(dis);
Expand All @@ -314,8 +315,8 @@ private void readHeader(final LittleEndianInputStream dis) throws IOException {
if (version >= 3) {
int nProperties = dis.readInt();
while (nProperties-- > 0) {
final String key = dis.readString();
final String value = dis.readString();
final String key = dis.readString(StandardCharsets.US_ASCII);
final String value = dis.readString(StandardCharsets.US_ASCII);
properties.put(key, value);
}
}
Expand All @@ -332,7 +333,7 @@ private void readSequenceDictionary(final LittleEndianInputStream dis) throws IO
final int size = dis.readInt();
if (size < 0) throw new IllegalStateException("Size of the sequence dictionary entries is negative");
for (int x = 0; x < size; x++) {
dis.readString();
dis.readString(StandardCharsets.US_ASCII);
dis.readInt();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -210,7 +211,7 @@ public void read(final LittleEndianInputStream dis) throws IOException {

tree = new IntervalTree();

name = dis.readString();
name = dis.readString(StandardCharsets.US_ASCII);
int nIntervals = dis.readInt();
while (nIntervals-- > 0) {

Expand Down
33 changes: 23 additions & 10 deletions src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;


/**
Expand Down Expand Up @@ -105,22 +107,33 @@ public final float readFloat() throws IOException {
}

/**
* Read a null terminated byte array and return result as a string
*
* @return
* @throws IOException
* Read a null terminated byte array and return result as a String
* This method decodes the bytes as a UTF-8 string.
* @throws IOException if reading from the stream fails for some reason
* @throws EOFException if the stream ends without encountering a null terminator.
* @deprecated Prefer the {@link #readString(Charset)} which allows specifying a charset explicitly
*/

@Deprecated
public String readString() throws IOException {
ByteArrayOutputStream bis = new ByteArrayOutputStream(100);
byte b;
while ((b = (byte) in.read()) != 0) {
return readString(StandardCharsets.UTF_8);
}

/**
* Read a null terminated byte array and return result as a String
* @param charset the Charset to use when decoding the bytes to a String
* @throws IOException if reading from the stream fails for some reason
* @throws EOFException if the stream ends without encountering a null terminator.
*/
public String readString(final Charset charset) throws IOException {
final ByteArrayOutputStream bis = new ByteArrayOutputStream(100);
int b;
while ((b = in.read()) != 0) {
if(b < 0) {
throw new EOFException();
}
bis.write(b);
bis.write((byte)b);
}
return new String(bis.toByteArray());
return bis.toString(charset.name());
}


Expand Down
48 changes: 48 additions & 0 deletions src/test/java/htsjdk/testutil/Expected.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package htsjdk.testutil;

import org.testng.Assert;

/**
 * An expectation about the outcome of running a piece of code in a test.
 *
 * <p>Instances are obtained from the static factories {@link #match(Object)},
 * {@link #mismatch(Object)} and {@link #exception(Class)}, and are then applied to the code
 * under test through {@link #test(ThrowingSupplier)}.
 *
 * @param <T> the type of value produced by the code under test
 */
@FunctionalInterface
public interface Expected<T> {

    /**
     * Runs {@code functionToTest} and checks its outcome against this expectation.
     *
     * @param functionToTest the code under test; it may throw a checked exception
     */
    void test(ThrowingSupplier<T> functionToTest);

    /**
     * A consumer whose body is allowed to throw a checked exception.
     *
     * @param <T> the type of the consumed value
     */
    @FunctionalInterface
    interface ThrowingConsumer<T> {
        void test(T a) throws Exception;
    }

    /**
     * Expects the produced value to be equal to {@code expected}, as judged by
     * {@code Assert.assertEquals}.
     *
     * @param expected the value the code under test should produce
     * @param <T> the type of the produced value
     * @return an expectation that fails when the produced value differs from {@code expected}
     */
    static <T> Expected<T> match(final T expected) {
        return new ComparisonExpected<>((T actual) -> Assert.assertEquals(actual, expected));
    }

    /**
     * Expects the produced value to differ from {@code expected}, as judged by
     * {@code Assert.assertNotEquals}.
     *
     * @param expected the value the code under test should NOT produce
     * @param <T> the type of the produced value
     * @return an expectation that fails when the produced value equals {@code expected}
     */
    static <T> Expected<T> mismatch(final T expected) {
        return new ComparisonExpected<>((T actual) -> Assert.assertNotEquals(actual, expected));
    }

    /**
     * Expects the code under test to throw; the check is delegated to
     * {@code Assert.assertThrows}, and the produced value (if any) is ignored.
     *
     * @param exceptionClass the exception type handed to {@code Assert.assertThrows}
     * @param <T> the type the code under test would produce if it did not throw
     * @return an expectation verified by {@code Assert.assertThrows}
     */
    static <T> Expected<T> exception(final Class<? extends Exception> exceptionClass) {
        return functionToTest -> Assert.assertThrows(exceptionClass, functionToTest::produce);
    }

    /**
     * A supplier whose body is allowed to throw a checked exception.
     *
     * @param <T> the type of the supplied value
     */
    @FunctionalInterface
    interface ThrowingSupplier<T> {
        T produce() throws Exception;
    }
}

/**
 * An {@link Expected} that feeds the value produced by the code under test into a check.
 *
 * @param <T> the type of value produced by the code under test
 */
final class ComparisonExpected<T> implements Expected<T> {
    /** The check applied to whatever the supplier produces. */
    private final ThrowingConsumer<T> check;

    ComparisonExpected(ThrowingConsumer<T> test) {
        this.check = test;
    }

    /**
     * Produces a value from {@code supplier} and hands it to the check.
     *
     * <p>An {@link AssertionError} raised by the check propagates unchanged: it is an
     * {@link Error}, so the {@code catch (Exception ...)} clause below never intercepts it.
     * Any {@link Exception} thrown by the supplier or the check is wrapped in an
     * {@link AssertionError} with the original exception as its cause.
     */
    @Override
    public void test(ThrowingSupplier<T> supplier) {
        try {
            final T produced = supplier.produce();
            check.test(produced);
        } catch (Exception failure) {
            throw new AssertionError(failure);
        }
    }
}
45 changes: 45 additions & 0 deletions src/test/java/htsjdk/tribble/util/LittleEndianInputStreamTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package htsjdk.tribble.util;

import htsjdk.HtsjdkTest;
import htsjdk.testutil.Expected;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Exercises {@link LittleEndianInputStream#readString(Charset)} against binary fixture files,
 * decoding each with several charsets and checking the outcome through {@link Expected}.
 */
public class LittleEndianInputStreamTest extends HtsjdkTest {

    /**
     * Supplies rows of {@code {fixture path, charset to decode with, expected outcome}}.
     *
     * <p>NOTE(review): the fixture contents are not visible here; judging by the file names,
     * one lacks a null terminator, one holds ISO 8859-1 "extended ASCII" text, and one holds
     * UTF-8 text with emoji - confirm against the resources.
     */
    @DataProvider
    public Object[][] testCases() {
        final String missingTerminator = "src/test/resources/htsjdk/tribble/util/string_with_extended_ascii_no_terminator.bin";
        final String extendedAsciiFile = "src/test/resources/htsjdk/tribble/util/string_with_extended_ascii_and_null_terminator.bin";
        // Declared as String (not Object) so it matches the other two fixture paths.
        final String utf8File = "src/test/resources/htsjdk/tribble/util/string_with_utf8_emoji_and_null_terminator.txt";
        return new Object[][]{
                // Each charset is expected to end in EOFException for the unterminated fixture.
                {missingTerminator, StandardCharsets.ISO_8859_1, Expected.exception(EOFException.class)},
                {missingTerminator, StandardCharsets.US_ASCII, Expected.exception(EOFException.class)},
                {missingTerminator, StandardCharsets.UTF_8, Expected.exception(EOFException.class)},
                // Only ISO 8859-1 is expected to reproduce the accented text.
                {extendedAsciiFile, StandardCharsets.ISO_8859_1, Expected.match("very dràààààmatic and null terminated")},
                {extendedAsciiFile, StandardCharsets.US_ASCII, Expected.mismatch("very dràààààmatic and null terminated")},
                {extendedAsciiFile, StandardCharsets.UTF_8, Expected.mismatch("very dràààààmatic and null terminated")},
                // UTF-8 is expected to round-trip the emoji text; ISO 8859-1 is not.
                {utf8File, StandardCharsets.UTF_8, Expected.match("🐋 UTF8 is Great 🐋")},
                {utf8File, StandardCharsets.ISO_8859_1, Expected.mismatch("🐋 UTF8 is Great 🐋")}
        };
    }

    /**
     * Opens {@code filename}, reads one string with {@code charset}, and lets {@code expected}
     * judge the result (or the exception thrown while reading).
     *
     * @param filename path of the fixture file to read
     * @param charset  charset passed to {@code readString}
     * @param expected the expectation applied to the read
     */
    @Test(dataProvider = "testCases")
    public void testAllCases(String filename, Charset charset, Expected<String> expected) {
        expected.test(() -> {
            // try-with-resources closes the stream whether the read succeeds or throws.
            try (final LittleEndianInputStream in = new LittleEndianInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(filename))))) {
                return in.readString(charset);
            }
        });
    }

}

0 comments on commit e5ba94c

Please sign in to comment.