diff --git a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java index 6b6b626..8735230 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java @@ -177,77 +177,77 @@ public boolean nextKeyValue() throws IOException { vc.set(v); return true; } -} - -class BGZFLimitingStream extends InputStream { - private final BlockCompressedInputStream bgzf; - private final long virtEnd; - private byte[] readBuf = new byte[1]; - - public BGZFLimitingStream(BlockCompressedInputStream stream, long virtualEnd) { - bgzf = stream; - virtEnd = virtualEnd; - } + static class BGZFLimitingStream extends InputStream { - @Override - public void close() throws IOException { - bgzf.close(); - } + private final BlockCompressedInputStream bgzf; + private final long virtEnd; + private byte[] readBuf = new byte[1]; - @Override - public int read() throws IOException { - switch (read(readBuf)) { - case 1: - return readBuf[0]; - case -1: - return -1; - default: - assert false; - return -1; + public BGZFLimitingStream(BlockCompressedInputStream stream, long virtualEnd) { + bgzf = stream; + virtEnd = virtualEnd; } - } - @Override - public int read(byte[] buf, int off, int len) throws IOException { - - int totalRead = 0; - long virt; - - final int lastLen = (int) virtEnd & 0xffff; - - while ((virt = bgzf.getFilePointer()) >>> 16 != virtEnd >>> 16) { - // We're not in the last BGZF block yet. Unfortunately - // BlockCompressedInputStream doesn't expose the length of the current - // block, so we can't simply (possibly repeatedly) read the current - // block to the end. Instead, we read at most virtEnd & 0xffff at a - // time, which ensures that we can't overshoot virtEnd even if the - // next block starts immediately. - final int r = bgzf.read(buf, off, Math.min(len, lastLen)); - if (r == -1) { - return totalRead == 0 ? -1 : totalRead; - } + @Override + public void close() throws IOException { + bgzf.close(); + } - totalRead += r; - len -= r; - if (len == 0) { - return totalRead; + @Override + public int read() throws IOException { + switch (read(readBuf)) { + case 1: + return readBuf[0]; + case -1: + return -1; + default: + assert false; + return -1; } - off += r; } - // We're in the last BGZF block: read only up to lastLen. - len = Math.min(len, ((int) virt & 0xffff) - lastLen); - while (len > 0) { - final int r = bgzf.read(buf, off, len); - if (r == -1) { - return totalRead == 0 ? -1 : totalRead; + @Override + public int read(byte[] buf, int off, int len) throws IOException { + + int totalRead = 0; + long virt; + + final int lastLen = (int) virtEnd & 0xffff; + + while ((virt = bgzf.getFilePointer()) >>> 16 != virtEnd >>> 16) { + // We're not in the last BGZF block yet. Unfortunately + // BlockCompressedInputStream doesn't expose the length of the current + // block, so we can't simply (possibly repeatedly) read the current + // block to the end. Instead, we read at most virtEnd & 0xffff at a + // time, which ensures that we can't overshoot virtEnd even if the + // next block starts immediately. + final int r = bgzf.read(buf, off, Math.min(len, lastLen)); + if (r == -1) { + return totalRead == 0 ? -1 : totalRead; + } + + totalRead += r; + len -= r; + if (len == 0) { + return totalRead; + } + off += r; } - totalRead += r; - len -= r; - off += r; + // We're in the last BGZF block: read only up to lastLen. + len = Math.min(len, ((int) virt & 0xffff) - lastLen); + while (len > 0) { + final int r = bgzf.read(buf, off, len); + if (r == -1) { + return totalRead == 0 ? -1 : totalRead; + } + + totalRead += r; + len -= r; + off += r; + } + return totalRead == 0 ? -1 : totalRead; } - return totalRead == 0 ? -1 : totalRead; } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java index 0ec2664..6bb4327 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java @@ -122,57 +122,57 @@ protected void writeRecord(VariantContext vc) { writer.add(vc); } -} - -// We must always call writer.writeHeader() because the writer requires -// the header in writer.add(), and writeHeader() is the only way to give -// the header to the writer. Thus, we use this class to simply throw away -// output until after the header's been written. -// -// This is, of course, a HACK and a slightly dangerous one: if writer -// does any buffering of its own and doesn't flush after writing the -// header, this isn't as easy as this. -// -// In addition we do BGZF compression here, to simplify things. -final class BCFStoppableOutputStream extends FilterOutputStream { - private final OutputStream origOut; - public boolean stopped; - - public BCFStoppableOutputStream(boolean startStopped, OutputStream out) { - super(new BlockCompressedOutputStream(out, null)); - origOut = out; - stopped = startStopped; - } + // We must always call writer.writeHeader() because the writer requires + // the header in writer.add(), and writeHeader() is the only way to give + // the header to the writer. Thus, we use this class to simply throw away + // output until after the header's been written. + // + // This is, of course, a HACK and a slightly dangerous one: if writer + // does any buffering of its own and doesn't flush after writing the + // header, this isn't as easy as this. + // + // In addition we do BGZF compression here, to simplify things. + static final class BCFStoppableOutputStream extends FilterOutputStream { + + private final OutputStream origOut; + public boolean stopped; + + public BCFStoppableOutputStream(boolean startStopped, OutputStream out) { + super(new BlockCompressedOutputStream(out, null)); + origOut = out; + stopped = startStopped; + } - @Override - public void write(int b) throws IOException { - if (!stopped) { - super.write(b); + @Override + public void write(int b) throws IOException { + if (!stopped) { + super.write(b); + } } - } - @Override - public void write(byte[] b) throws IOException { - if (!stopped) { - super.write(b); + @Override + public void write(byte[] b) throws IOException { + if (!stopped) { + super.write(b); + } } - } - @Override - public void write(byte[] b, int off, int len) throws IOException { - if (!stopped) { - super.write(b, off, len); + @Override + public void write(byte[] b, int off, int len) throws IOException { + if (!stopped) { + super.write(b, off, len); + } } - } - @Override - public void close() throws IOException { - // Don't close the BlockCompressedOutputStream, as we don't want - // the BGZF terminator. - this.out.flush(); + @Override + public void close() throws IOException { + // Don't close the BlockCompressedOutputStream, as we don't want + // the BGZF terminator. + this.out.flush(); - // Instead, close the lower-level output stream directly. - origOut.close(); + // Instead, close the lower-level output stream directly. + origOut.close(); + } } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java b/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java index 3a908b1..a1faf90 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java @@ -65,89 +65,89 @@ public BAMRecord createBAMRecord( insertSize, variableLengthBlock); } -} - -class LazyBAMRecord extends BAMRecord { - - private boolean decodedRefIdx = false; - private boolean decodedMateRefIdx = false; - public LazyBAMRecord( - SAMFileHeader hdr, - int referenceID, - int coordinate, - short readNameLength, - short mappingQuality, - int indexingBin, - int cigarLen, - int flags, - int readLen, - int mateReferenceID, - int mateCoordinate, - int insertSize, - byte[] restOfData) { - super( - hdr, - referenceID, - coordinate, - readNameLength, - mappingQuality, - indexingBin, - cigarLen, - flags, - readLen, - mateReferenceID, - mateCoordinate, - insertSize, - restOfData); - } + static class LazyBAMRecord extends BAMRecord { + + private boolean decodedRefIdx = false; + private boolean decodedMateRefIdx = false; + + public LazyBAMRecord( + SAMFileHeader hdr, + int referenceID, + int coordinate, + short readNameLength, + short mappingQuality, + int indexingBin, + int cigarLen, + int flags, + int readLen, + int mateReferenceID, + int mateCoordinate, + int insertSize, + byte[] restOfData) { + super( + hdr, + referenceID, + coordinate, + readNameLength, + mappingQuality, + indexingBin, + cigarLen, + flags, + readLen, + mateReferenceID, + mateCoordinate, + insertSize, + restOfData); + } - @Override - public void setReferenceIndex(final int referenceIndex) { - mReferenceIndex = referenceIndex; - decodedRefIdx = false; - } + @Override + public void setReferenceIndex(final int referenceIndex) { + mReferenceIndex = referenceIndex; + decodedRefIdx = false; + } - @Override - public void setMateReferenceIndex(final int referenceIndex) { - mMateReferenceIndex = referenceIndex; - decodedMateRefIdx = false; - } + @Override + public void setMateReferenceIndex(final int referenceIndex) { + mMateReferenceIndex = referenceIndex; + decodedMateRefIdx = false; + } - @Override - public String getReferenceName() { - if (mReferenceIndex != null && !decodedRefIdx) { - decodedRefIdx = true; - super.setReferenceIndex(mReferenceIndex); + @Override + public String getReferenceName() { + if (mReferenceIndex != null && !decodedRefIdx) { + decodedRefIdx = true; + super.setReferenceIndex(mReferenceIndex); + } + return super.getReferenceName(); } - return super.getReferenceName(); - } - @Override - public String getMateReferenceName() { - if (mMateReferenceIndex != null && !decodedMateRefIdx) { - decodedMateRefIdx = true; - super.setMateReferenceIndex(mMateReferenceIndex); + @Override + public String getMateReferenceName() { + if (mMateReferenceIndex != null && !decodedMateRefIdx) { + decodedMateRefIdx = true; + super.setMateReferenceIndex(mMateReferenceIndex); + } + return super.getMateReferenceName(); } - return super.getMateReferenceName(); - } - @Override - protected void eagerDecode() { - getReferenceName(); - getMateReferenceName(); - super.eagerDecode(); - } + @Override + protected void eagerDecode() { + getReferenceName(); + getMateReferenceName(); + super.eagerDecode(); + } - @Override - public boolean equals(Object o) { - // don't use decoded flags for equality check - return super.equals(o); - } + @Override + public boolean equals(Object o) { + // don't use decoded flags for equality check + return super.equals(o); + } - @Override - public int hashCode() { - // don't use decoded flags for hash code - return super.hashCode(); + @Override + public int hashCode() { + // don't use decoded flags for hash code + return super.hashCode(); + } } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java b/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java index 3fca4a9..3922d78 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java @@ -101,34 +101,34 @@ public LazyGenotypesContext.LazyData parse(final Object data) { return codec.createGenotypeMap(str, alleles, chrom, start); } } -} -// This is a HACK. But, the functionality is only in AbstractVCFCodec so it -// can't be helped. This is preferable to copying the functionality into -// parse() above. -class HeaderSettableVCFCodec extends AbstractVCFCodec { + // This is a HACK. But, the functionality is only in AbstractVCFCodec so it + // can't be helped. This is preferable to copying the functionality into + // parse() above. + static class HeaderSettableVCFCodec extends AbstractVCFCodec { - public boolean hasHeader() { - return header != null; - } + public boolean hasHeader() { + return header != null; + } - public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) { - this.header = header; - this.version = ver; - } + public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) { + this.header = header; + this.version = ver; + } - @Override - public Object readActualHeader(LineIterator reader) { - throw new UnsupportedOperationException("Internal error: this shouldn't be called"); - } + @Override + public Object readActualHeader(LineIterator reader) { + throw new UnsupportedOperationException("Internal error: this shouldn't be called"); + } - @Override - public List parseFilters(String filterString) { - throw new UnsupportedOperationException("Internal error: this shouldn't be called"); - } + @Override + public List parseFilters(String filterString) { + throw new UnsupportedOperationException("Internal error: this shouldn't be called"); + } - @Override - public boolean canDecode(String s) { - return true; + @Override + public boolean canDecode(String s) { + return true; + } } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java index d6159d4..0a52d08 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java @@ -191,172 +191,172 @@ public boolean nextKeyValue() { record.set(r); return true; } -} -// See the long comment in SAMRecordReader.initialize() for what this does. -class WorkaroundingStream extends InputStream { + // See the long comment in SAMRecordReader.initialize() for what this does. + static class WorkaroundingStream extends InputStream { - private final InputStream stream, headerStream; - private boolean headerRemaining; - private long length; - private int headerLength; + private final InputStream stream, headerStream; + private boolean headerRemaining; + private long length; + private int headerLength; - private boolean lookingForEOL = false, - foundEOL = false, - strippingAts = false; // HACK, see read(byte[], int, int). - private byte[] readBuf = new byte[1]; + private boolean lookingForEOL = false, + foundEOL = false, + strippingAts = false; // HACK, see read(byte[], int, int). + private byte[] readBuf = new byte[1]; - public WorkaroundingStream(InputStream stream, SAMFileHeader header) { - this.stream = stream; + public WorkaroundingStream(InputStream stream, SAMFileHeader header) { + this.stream = stream; - String text = header.getTextHeader(); - if (text == null) { - StringWriter writer = new StringWriter(); - new SAMTextHeaderCodec().encode(writer, header); - text = writer.toString(); - } - byte[] b; - try { - b = text.getBytes("UTF-8"); - } catch (UnsupportedEncodingException e) { - b = null; - assert false; - } - headerRemaining = true; - headerLength = b.length; - headerStream = new ByteArrayInputStream(b); + String text = header.getTextHeader(); + if (text == null) { + StringWriter writer = new StringWriter(); + new SAMTextHeaderCodec().encode(writer, header); + text = writer.toString(); + } + byte[] b; + try { + b = text.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + b = null; + assert false; + } + headerRemaining = true; + headerLength = b.length; + headerStream = new ByteArrayInputStream(b); - this.length = Long.MAX_VALUE; - } + this.length = Long.MAX_VALUE; + } - public void setLength(long length) { - this.length = length; - } + public void setLength(long length) { + this.length = length; + } - public int getRemainingHeaderLength() { - return headerLength; - } + public int getRemainingHeaderLength() { + return headerLength; + } - @Override - public int read() throws IOException { - for (; ; ) { - switch (read(readBuf)) { - case 0: - continue; - case 1: - return readBuf[0]; - case -1: - return -1; + @Override + public int read() throws IOException { + for (; ; ) { + switch (read(readBuf)) { + case 0: + continue; + case 1: + return readBuf[0]; + case -1: + return -1; + } } } - } - @Override - public int read(byte[] buf, int off, int len) throws IOException { - if (!headerRemaining) { - return streamRead(buf, off, len); - } + @Override + public int read(byte[] buf, int off, int len) throws IOException { + if (!headerRemaining) { + return streamRead(buf, off, len); + } - int h; - if (strippingAts) { - h = 0; - } else { - h = headerStream.read(buf, off, len); - if (h == -1) { - // This should only happen when there was no header at all, in - // which case Picard doesn't throw an error until trying to read - // a record, for some reason. (Perhaps an oversight.) Thus we - // need to handle that case here. - assert (headerLength == 0); + int h; + if (strippingAts) { h = 0; - } else if (h < headerLength) { - headerLength -= h; - return h; + } else { + h = headerStream.read(buf, off, len); + if (h == -1) { + // This should only happen when there was no header at all, in + // which case Picard doesn't throw an error until trying to read + // a record, for some reason. (Perhaps an oversight.) Thus we + // need to handle that case here. + assert (headerLength == 0); + h = 0; + } else if (h < headerLength) { + headerLength -= h; + return h; + } + strippingAts = true; + headerStream.close(); } - strippingAts = true; - headerStream.close(); - } - final int newOff = off + h; - int s = streamRead(buf, newOff, len - h); + final int newOff = off + h; + int s = streamRead(buf, newOff, len - h); - if (s <= 0) { - return strippingAts ? s : h; - } + if (s <= 0) { + return strippingAts ? s : h; + } - // HACK HACK HACK. - // - // We gave all of the header, which means that SAMFileReader is still - // trying to read more header lines. If we're in a split that isn't at - // the start of the SAM file, we could be in the middle of a line and - // thus see @ characters at the start of our data. Then SAMFileReader - // would try to understand those as header lines and the end result is - // that it throws an error, since they aren't actually header lines, - // they're just part of a SAM record. - // - // So, if we're done with the header, strip all @ characters we see. Thus - // SAMFileReader will stop reading the header there and won't throw an - // exception until we use its SAMRecordIterator, at which point we can - // catch it, because we know to expect it. - // - // headerRemaining remains true while it's possible that there are still - // @ characters coming. + // HACK HACK HACK. + // + // We gave all of the header, which means that SAMFileReader is still + // trying to read more header lines. If we're in a split that isn't at + // the start of the SAM file, we could be in the middle of a line and + // thus see @ characters at the start of our data. Then SAMFileReader + // would try to understand those as header lines and the end result is + // that it throws an error, since they aren't actually header lines, + // they're just part of a SAM record. + // + // So, if we're done with the header, strip all @ characters we see. Thus + // SAMFileReader will stop reading the header there and won't throw an + // exception until we use its SAMRecordIterator, at which point we can + // catch it, because we know to expect it. + // + // headerRemaining remains true while it's possible that there are still + // @ characters coming. + + int i = newOff - 1; + while (buf[++i] == '@' && --s > 0) {; + } - int i = newOff - 1; - while (buf[++i] == '@' && --s > 0) {; - } + if (i != newOff) { + System.arraycopy(buf, i, buf, newOff, s); + } - if (i != newOff) { - System.arraycopy(buf, i, buf, newOff, s); + headerRemaining = s == 0; + return h + s; } - headerRemaining = s == 0; - return h + s; - } - - private int streamRead(byte[] buf, int off, int len) throws IOException { - if (len > length) { - if (foundEOL) { - return 0; + private int streamRead(byte[] buf, int off, int len) throws IOException { + if (len > length) { + if (foundEOL) { + return 0; + } + lookingForEOL = true; } - lookingForEOL = true; - } - int n = stream.read(buf, off, len); - if (n > 0) { - n = tryFindEOL(buf, off, n); - length -= n; + int n = stream.read(buf, off, len); + if (n > 0) { + n = tryFindEOL(buf, off, n); + length -= n; + } + return n; } - return n; - } - private int tryFindEOL(byte[] buf, int off, int len) { - assert !foundEOL; + private int tryFindEOL(byte[] buf, int off, int len) { + assert !foundEOL; - if (!lookingForEOL || len < length) { - return len; - } + if (!lookingForEOL || len < length) { + return len; + } - // Find the first EOL between length and len. + // Find the first EOL between length and len. - // len >= length so length fits in an int. - int i = Math.max(0, (int) length - 1); + // len >= length so length fits in an int. + int i = Math.max(0, (int) length - 1); - for (; i < len; ++i) { - if (buf[off + i] == '\n') { - foundEOL = true; - return i + 1; + for (; i < len; ++i) { + if (buf[off + i] == '\n') { + foundEOL = true; + return i + 1; + } } + return len; } - return len; - } - @Override - public void close() throws IOException { - stream.close(); - } + @Override + public void close() throws IOException { + stream.close(); + } - @Override - public int available() throws IOException { - return headerRemaining ? headerStream.available() : stream.available(); + @Override + public int available() throws IOException { + return headerRemaining ? headerStream.available() : stream.available(); + } } }