Skip to content

Commit

Permalink
CRAM: Only calculate alignmentDelta as needed for records (#1304)
Browse files Browse the repository at this point in the history
* Only calculate alignmentDelta as needed
- alignmentStart is the sole source of truth
- Simplify isPlaced() and tests
- Make a few tests APDelta-agnostic
- read() returns new aln start
  • Loading branch information
jmthibault79 authored Feb 27, 2019
1 parent 62388a2 commit b58a5a9
Show file tree
Hide file tree
Showing 14 changed files with 146 additions and 167 deletions.
7 changes: 1 addition & 6 deletions src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -317,18 +317,14 @@ protected void flushContainer() throws IllegalArgumentException, IllegalAccessEx
containerFactory.setPreserveReadNames(preserveReadNames);

int index = 0;
int prevAlStart = start;
for (final SAMRecord samRecord : samRecords) {
if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && refSeqIndex != samRecord.getReferenceIndex()) {
// this may load all ref sequences into memory:
sam2CramRecordFactory.setRefBases(source.getReferenceBases(samFileHeader.getSequence(samRecord.getReferenceIndex()), true));
}
final CramCompressionRecord cramRecord = sam2CramRecordFactory.createCramRecord(samRecord);
cramRecord.index = ++index;
cramRecord.alignmentDelta = samRecord.getAlignmentStart() - prevAlStart;
cramRecord.alignmentStart = samRecord.getAlignmentStart();
prevAlStart = samRecord.getAlignmentStart();

cramRecords.add(cramRecord);

if (preservation != null) preservation.addQualityScores(samRecord, cramRecord, tracks);
Expand Down Expand Up @@ -379,8 +375,7 @@ protected void flushContainer() throws IllegalArgumentException, IllegalAccessEx
while (last.next != null) last = last.next;

if (cramRecord.isFirstSegment() && last.isLastSegment()) {
final boolean coordinateSorted = (samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate);
final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last, coordinateSorted);
final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last);

if (cramRecord.templateSize == templateLength) {
last = cramRecord.next;
Expand Down
10 changes: 3 additions & 7 deletions src/main/java/htsjdk/samtools/cram/build/ContainerParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,14 @@ private ArrayList<CramCompressionRecord> getRecords(final Slice slice,

final ArrayList<CramCompressionRecord> records = new ArrayList<>(slice.nofRecords);

int prevStart = slice.alignmentStart;
int prevAlignmentStart = slice.alignmentStart;
for (int i = 0; i < slice.nofRecords; i++) {
final CramCompressionRecord record = new CramCompressionRecord();
record.sliceIndex = slice.index;
record.index = i;

reader.read(record);
// read the new record and update the running prevAlignmentStart
prevAlignmentStart = reader.read(record, prevAlignmentStart);

if (record.sequenceId == slice.sequenceId) {
record.sequenceName = seqName;
Expand All @@ -140,11 +141,6 @@ private ArrayList<CramCompressionRecord> getRecords(final Slice slice,
}

records.add(record);

if (header.APDelta) {
prevStart += record.alignmentDelta;
record.alignmentStart = prevStart;
}
}

return records;
Expand Down
16 changes: 6 additions & 10 deletions src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ public void normalize(final ArrayList<CramCompressionRecord> records,
for (final CramCompressionRecord record : records) {
if (record.previous != null) continue;
if (record.next == null) continue;
final boolean usePositionDeltaEncoding = header.getSortOrder() == SAMFileHeader.SortOrder.coordinate;
restoreMateInfo(record, usePositionDeltaEncoding);
restoreMateInfo(record);
}
}

Expand Down Expand Up @@ -138,8 +137,7 @@ public void normalize(final ArrayList<CramCompressionRecord> records,
restoreQualityScores(defaultQualityScore, records);
}

private static void restoreMateInfo(final CramCompressionRecord record,
final boolean usePositionDeltaEncoding) {
private static void restoreMateInfo(final CramCompressionRecord record) {
if (record.next == null) {

return;
Expand All @@ -157,7 +155,7 @@ private static void restoreMateInfo(final CramCompressionRecord record,
// record.setFirstSegment(true);
// last.setLastSegment(true);

final int templateLength = computeInsertSize(record, last, usePositionDeltaEncoding);
final int templateLength = computeInsertSize(record, last);
record.templateSize = templateLength;
last.templateSize = -templateLength;
}
Expand Down Expand Up @@ -328,21 +326,19 @@ private static byte getByteOrDefault(final byte[] array, final int pos,
*
* @param firstEnd first mate of the pair
* @param secondEnd second mate of the pair
* @param usePositionDeltaEncoding do these records delta-encode their alignment starts?
* @return template length
*/
public static int computeInsertSize(final CramCompressionRecord firstEnd,
final CramCompressionRecord secondEnd,
final boolean usePositionDeltaEncoding) {
final CramCompressionRecord secondEnd) {
if (firstEnd.isSegmentUnmapped() || secondEnd.isSegmentUnmapped()) {
return 0;
}
if (firstEnd.sequenceId != secondEnd.sequenceId) {
return 0;
}

final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd(usePositionDeltaEncoding) : firstEnd.alignmentStart;
final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd(usePositionDeltaEncoding) : secondEnd.alignmentStart;
final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd() : firstEnd.alignmentStart;
final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd() : secondEnd.alignmentStart;

final int adjustment = (secondEnd5PrimePosition >= firstEnd5PrimePosition) ? +1 : -1;
return secondEnd5PrimePosition - firstEnd5PrimePosition + adjustment;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,10 @@ private <T> DataSeriesReader<T> createDataReader(DataSeries dataSeries) {
* Read a Cram Compression Record, using this class's Encodings
*
* @param cramRecord the Cram Compression Record to read into
* @param prevAlignmentStart the alignmentStart of the previous record, for delta calculation
* @return the alignmentStart of the newly-read record
*/
public void read(final CramCompressionRecord cramRecord) {
public int read(final CramCompressionRecord cramRecord, final int prevAlignmentStart) {
try {
// int mark = testCodec.readData();
// if (Writer.TEST_MARK != mark) {
Expand All @@ -181,7 +183,7 @@ public void read(final CramCompressionRecord cramRecord) {

cramRecord.readLength = readLengthCodec.readData();
if (APDelta) {
cramRecord.alignmentDelta = alignmentStartCodec.readData();
cramRecord.alignmentStart = prevAlignmentStart + alignmentStartCodec.readData();
} else {
cramRecord.alignmentStart = alignmentStartCodec.readData();
}
Expand Down Expand Up @@ -329,5 +331,8 @@ public void read(final CramCompressionRecord cramRecord) {
}
throw new RuntimeIOException(e);
}

return cramRecord.alignmentStart;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@
*/
public class MultiRefSliceAlignmentSpanReader extends CramRecordReader {
/**
* Alignment start to start counting from
* Alignment start of the previous record, for delta-encoding if necessary
*/
private int currentAlignmentStart;
private int prevAlignmentStart;

/**
* Detected sequence spans
Expand All @@ -60,7 +60,7 @@ public MultiRefSliceAlignmentSpanReader(final BitInputStream coreInputStream,
final int recordCount) {
super(coreInputStream, externalInputMap, header, Slice.MULTI_REFERENCE, validationStringency);

this.currentAlignmentStart = initialAlignmentStart;
this.prevAlignmentStart = initialAlignmentStart;

for (int i = 0; i < recordCount; i++) {
readCramRecord();
Expand All @@ -73,18 +73,13 @@ public Map<Integer, AlignmentSpan> getReferenceSpans() {

private void readCramRecord() {
final CramCompressionRecord cramRecord = new CramCompressionRecord();
super.read(cramRecord);

if (APDelta) {
currentAlignmentStart += cramRecord.alignmentDelta;
} else {
currentAlignmentStart = cramRecord.alignmentStart;
}
prevAlignmentStart = super.read(cramRecord, prevAlignmentStart);

if (!spans.containsKey(cramRecord.sequenceId)) {
spans.put(cramRecord.sequenceId, new AlignmentSpan(currentAlignmentStart, cramRecord.readLength));
spans.put(cramRecord.sequenceId, new AlignmentSpan(cramRecord.alignmentStart, cramRecord.readLength));
} else {
spans.get(cramRecord.sequenceId).addSingle(currentAlignmentStart, cramRecord.readLength);
spans.get(cramRecord.sequenceId).addSingle(cramRecord.alignmentStart, cramRecord.readLength);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -146,22 +146,23 @@ private <T> DataSeriesWriter<T> createDataWriter(final DataSeries dataSeries) {
* Writes a series of Cram Compression Records, using this class's Encodings
*
* @param records the Cram Compression Records to write
* @param prevAlignmentStart the alignmentStart of the previous record, for delta calculation
* @param initialAlignmentStart the alignmentStart of the enclosing {@link Slice}, for delta calculation
*/
public void writeCramCompressionRecords(final List<CramCompressionRecord> records, int prevAlignmentStart) {
public void writeCramCompressionRecords(final List<CramCompressionRecord> records, final int initialAlignmentStart) {
int prevAlignmentStart = initialAlignmentStart;
for (final CramCompressionRecord record : records) {
record.alignmentDelta = record.alignmentStart - prevAlignmentStart;
writeRecord(record, prevAlignmentStart);
prevAlignmentStart = record.alignmentStart;
writeRecord(record);
}
}

/**
* Write a Cram Compression Record, using this class's Encodings
*
* @param r the Cram Compression Record to write
* @param prevAlignmentStart the alignmentStart of the previous record, for delta calculation
*/
private void writeRecord(final CramCompressionRecord r) {
private void writeRecord(final CramCompressionRecord r, final int prevAlignmentStart) {
bitFlagsC.writeData(r.flags);
compBitFlagsC.writeData(r.getCompressionFlags());
if (refId == Slice.MULTI_REFERENCE) {
Expand All @@ -171,7 +172,8 @@ private void writeRecord(final CramCompressionRecord r) {
readLengthC.writeData(r.readLength);

if (AP_delta) {
alStartC.writeData(r.alignmentDelta);
final int alignmentDelta = r.alignmentStart - prevAlignmentStart;
alStartC.writeData(alignmentDelta);
} else {
alStartC.writeData(r.alignmentStart);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ public class CramCompressionRecord {
// sequential index of the record in a stream:
public int index = 0;
public int alignmentStart;
public int alignmentDelta;
private int alignmentEnd = UNINITIALIZED_END;
private int alignmentSpan = UNINITIALIZED_SPAN;

Expand Down Expand Up @@ -129,7 +128,6 @@ public boolean equals(final Object obj) {

if (!Arrays.equals(readBases, cramRecord.readBases)) return false;
return Arrays.equals(qualityScores, cramRecord.qualityScores) && areEqual(flags, cramRecord.flags) && areEqual(readName, cramRecord.readName);

}

private boolean areEqual(final Object o1, final Object o2) {
Expand All @@ -156,7 +154,7 @@ public String toString() {
final StringBuilder stringBuilder = new StringBuilder("[");
if (readName != null) stringBuilder.append(readName).append("; ");
stringBuilder.append("flags=").append(flags)
.append("; alignmentOffset=").append(alignmentDelta)
.append("; alignmentStart=").append(alignmentStart)
.append("; mateOffset=").append(recordsToNextFragment)
.append("; mappingQuality=").append(mappingQuality);

Expand All @@ -172,33 +170,31 @@ public String toString() {

/**
* Check whether alignmentSpan has been initialized, and do so if it has not
* @param usePositionDeltaEncoding is this record's position delta-encoded? used to call isPlaced()
* @return the initialized alignmentSpan
*/
public int getAlignmentSpan(final boolean usePositionDeltaEncoding) {
public int getAlignmentSpan() {
if (alignmentSpan == UNINITIALIZED_SPAN) {
intializeAlignmentBoundaries(usePositionDeltaEncoding);
intializeAlignmentBoundaries();
}
return alignmentSpan;
}

/**
* Check whether alignmentEnd has been initialized, and do so if it has not
* @param usePositionDeltaEncoding is this record's position delta-encoded? used to call isPlaced()
* @return the initialized alignmentEnd
*/
public int getAlignmentEnd(final boolean usePositionDeltaEncoding) {
public int getAlignmentEnd() {
if (alignmentEnd == UNINITIALIZED_END) {
intializeAlignmentBoundaries(usePositionDeltaEncoding);
intializeAlignmentBoundaries();
}
return alignmentEnd;
}

// https://github.com/samtools/htsjdk/issues/1301
// does not update alignmentSpan/alignmentEnd when the record changes

private void intializeAlignmentBoundaries(final boolean usePositionDeltaEncoding) {
if (!isPlaced(usePositionDeltaEncoding)) {
private void intializeAlignmentBoundaries() {
if (!isPlaced()) {
alignmentSpan = Slice.NO_ALIGNMENT_SPAN;
alignmentEnd = Slice.NO_ALIGNMENT_END;
return;
Expand Down Expand Up @@ -244,7 +240,7 @@ public void setMultiFragment(final boolean multiFragment) {
* Unmapped records may be stored in the same {@link Slice}s and {@link Container}s as mapped
* records if they are placed.
*
* @see #isPlaced(boolean)
* @see #isPlaced()
* @return true if the record is unmapped
*/
public boolean isSegmentUnmapped() {
Expand All @@ -258,19 +254,19 @@ public void setSegmentUnmapped(final boolean segmentUnmapped) {
/**
* Does this record have a valid placement/alignment location?
*
* It must have a valid reference sequence ID to be considered placed.
* In the case of absolute (non-delta) position encoding, it must also have a
* valid alignment start position, so we need to know if it is delta-encoded.
* It must have a valid reference sequence ID and a valid alignment start position
* to be considered placed.
*
* Normally we expect to see that the unmapped flag is set for unplaced reads,
* so we log a WARNING here if the read is unplaced yet somehow mapped.
*
* @see #isSegmentUnmapped()
* @param usePositionDeltaEncoding is this read's position delta-encoded? if not, check alignment Start.
* @return true if the record is placed
*/
public boolean isPlaced(final boolean usePositionDeltaEncoding) {
boolean placed = isPlacedInternal(!usePositionDeltaEncoding);
boolean isPlaced() {
// placement requires a valid sequence ID and alignment start coordinate
boolean placed = sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX &&
alignmentStart != SAMRecord.NO_ALIGNMENT_START;

if (!placed && !isSegmentUnmapped()) {
final String warning = String.format(
Expand All @@ -283,17 +279,6 @@ public boolean isPlaced(final boolean usePositionDeltaEncoding) {
return placed;
}

// check placement without regard to mapping; helper method for isPlaced()
private boolean isPlacedInternal(final boolean useAbsolutePositionEncoding) {
// placement requires a valid sequence ID
if (sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
return false;
}

// if an absolute alignment start coordinate is required but we have none, it's unplaced
return ! (useAbsolutePositionEncoding && alignmentStart == SAMRecord.NO_ALIGNMENT_START);
}

public boolean isFirstSegment() {
return (flags & FIRST_SEGMENT_FLAG) != 0;
}
Expand Down Expand Up @@ -407,5 +392,4 @@ public boolean isSupplementary() {
public void setSupplementary(final boolean supplementary) {
flags = supplementary ? flags | SUPPLEMENTARY_FLAG : flags & ~SUPPLEMENTARY_FLAG;
}

}
8 changes: 4 additions & 4 deletions src/main/java/htsjdk/samtools/cram/structure/Slice.java
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ public static Slice buildSlice(final List<CramCompressionRecord> records,
// Unmapped: all records are unmapped and unplaced

final CramCompressionRecord firstRecord = records.get(0);
slice.sequenceId = firstRecord.isPlaced(header.APDelta) ?
slice.sequenceId = firstRecord.isPlaced() ?
firstRecord.sequenceId :
SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;

Expand All @@ -398,19 +398,19 @@ public static Slice buildSlice(final List<CramCompressionRecord> records,
hasher.add(record);

// flip an unmapped slice to multi-ref if the record is placed
if (slice.isUnmapped() && record.isPlaced(header.APDelta)) {
if (slice.isUnmapped() && record.isPlaced()) {
slice.sequenceId = MULTI_REFERENCE;
}
else if (slice.isMappedSingleRef()) {
// flip a single-ref slice to multi-ref if the record is unplaced or on a different ref
if (!record.isPlaced(header.APDelta) || slice.sequenceId != record.sequenceId) {
if (!record.isPlaced() || slice.sequenceId != record.sequenceId) {
slice.sequenceId = MULTI_REFERENCE;
}
else {
// calculate single ref slice alignment

minAlStart = Math.min(record.alignmentStart, minAlStart);
maxAlEnd = Math.max(record.getAlignmentEnd(header.APDelta), maxAlEnd);
maxAlEnd = Math.max(record.getAlignmentEnd(), maxAlEnd);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/htsjdk/samtools/CRAMBAIIndexerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ private static CramCompressionRecord createRecord(int recordIndex, int seqId, in
record.setSegmentUnmapped(false);
record.setMultiFragment(false);
record.sequenceId = seqId;
record.alignmentStart =start;
record.alignmentStart = start;
record.readBases = record.qualityScores = bases;
record.readName = Integer.toString(recordIndex);
record.readLength = readLength;
Expand Down
Loading

0 comments on commit b58a5a9

Please sign in to comment.