Skip to content

Commit

Permalink
new bit-compression (#6691)
Browse files Browse the repository at this point in the history
  • Loading branch information
kcibul authored and ahaessly committed Aug 13, 2020
1 parent 3007854 commit c552d19
Show file tree
Hide file tree
Showing 8 changed files with 271 additions and 129 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ public enum QueryMode {
)
private String cohortTable = null;

@Argument(
fullName = "use-compressed-data",
doc = "If true, use bit-packed fields for data",
optional = true)
private boolean useCompressedData = false;

@Argument(
fullName = "print-debug-information",
doc = "If true, print extra debugging output",
Expand Down Expand Up @@ -171,7 +177,7 @@ protected void onStartup() {
probeIdMap,
cohortTable,
localSortMaxRecordsInRam,
false,
useCompressedData,
printDebugInformation,
progressMeter);
vcfWriter.writeHeader(header);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.ProgressMeter;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.tools.variantdb.RawArrayData.ArrayGenotype;
import org.broadinstitute.hellbender.tools.variantdb.BasicArrayData.ArrayGenotype;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand Down Expand Up @@ -122,8 +122,8 @@ private void createVariantsFromUngroupedTableResult(final GATKAvroReader avroRea
for ( final GenericRecord sortedRow : sortingCollection ) {
long probeId;
if (useCompressedData) {
final long rawData = (Long) sortedRow.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME);
RawArrayData data = RawArrayData.decode(rawData);
final long bits = (Long) sortedRow.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME);
BasicArrayData data = new BasicArrayData(bits);
probeId = data.probeId;
} else {
probeId = (Long) sortedRow.get("probe_id");
Expand Down Expand Up @@ -162,7 +162,17 @@ private void processSampleRecordsForLocation(final long probeId, final Iterable<
int numRecordsAtPosition = 0;

for ( final GenericRecord sampleRecord : sampleRecordsAtPosition ) {
final long sampleId = (Long) sampleRecord.get(SchemaUtils.SAMPLE_ID_FIELD_NAME);
final long sampleId;
if (useCompressedData) {
final long bits = (Long) sampleRecord.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME);
BasicArrayData data = new BasicArrayData(bits);
sampleId = data.sampleId;
} else {
sampleId = (Long) sampleRecord.get(SchemaUtils.SAMPLE_ID_FIELD_NAME);

// TODO: hack to test roundtrip

}

// TODO: handle missing values
String sampleName = sampleIdMap.get((int) sampleId);
Expand Down Expand Up @@ -275,19 +285,22 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob
List<Allele> genotypeAlleles = new ArrayList<Allele>();

if (this.useCompressedData) {
final RawArrayData data = RawArrayData.decode((Long) sampleRecord.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME));
normx = data.normx;
normy = data.normy;
lrr = data.lrr;
baf = data.baf;
final BasicArrayData basicData = new BasicArrayData((Long) sampleRecord.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME));
Object rd = sampleRecord.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME);

final RawArrayData rawData = new RawArrayData((Long) rd);
normx = rawData.normx;
normy = rawData.normy;
lrr = rawData.lrr;
baf = rawData.baf;

if (data.genotype == ArrayGenotype.AA) {
if (basicData.genotype == ArrayGenotype.AA) {
genotypeAlleles.add(alleleA);
genotypeAlleles.add(alleleA);
} else if (data.genotype == ArrayGenotype.AB) {
} else if (basicData.genotype == ArrayGenotype.AB) {
genotypeAlleles.add(alleleA);
genotypeAlleles.add(alleleB);
} else if (data.genotype == ArrayGenotype.BB) {
} else if (basicData.genotype == ArrayGenotype.BB) {
genotypeAlleles.add(alleleB);
genotypeAlleles.add(alleleB);
} else {
Expand Down Expand Up @@ -326,16 +339,11 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob
lrr = getNullableFloatFromDouble(sampleRecord.get("LRR"));

// Hack to pack and unpack data
RawArrayData d = new RawArrayData();
d.probeId = (int) probeInfo.probeId;
d.genotype = agt;
d.baf = baf;
d.lrr = lrr;
d.normx = normx;
d.normy = normy;
BasicArrayData b = new BasicArrayData(0, (int) probeInfo.probeId, agt);
RawArrayData d = new RawArrayData(normx, normy, lrr, baf);

long bits = d.encode();
RawArrayData d2 = RawArrayData.decode(bits);
RawArrayData d2 = new RawArrayData(bits);
normx = d2.normx;
normy = d2.normy;
baf = d2.baf;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package org.broadinstitute.hellbender.tools.variantdb;

import static org.broadinstitute.hellbender.tools.variantdb.BinaryUtils.*;

import org.broadinstitute.hellbender.exceptions.GATKException;

/**
 * Identifying data for one array genotype call (sample id, probe id and genotype)
 * that can be packed into, and unpacked from, a single {@code long}.
 *
 * <p>Bit layout, lowest bit first, as defined by the offset/length constants below:
 * <pre>
 *   bits  0-1   genotype  (GT_LENGTH = 2, the ordinal of {@link ArrayGenotype})
 *   bits  2-31  probe id  (PROBE_ID_LENGTH = 30)
 *   bits 32-61  sample id (SAMPLE_ID_LENGTH = 30)
 * </pre>
 */
public class BasicArrayData {
    public enum ArrayGenotype {
        // Order is critical here, the ordinal is the int encoding
        AA, AB, BB, NO_CALL
    }

    public int sampleId;
    public int probeId;
    public ArrayGenotype genotype;

    public static final int GT_LENGTH = 2;
    public static final int PROBE_ID_LENGTH = 30;
    // Largest value representable in PROBE_ID_LENGTH bits (2^30 - 1).
    public static final int MAX_PROBE_ID_VALUE = (1 << PROBE_ID_LENGTH) - 1;

    public static final int SAMPLE_ID_LENGTH = 30;
    // Largest value representable in SAMPLE_ID_LENGTH bits (2^30 - 1).
    public static final int MAX_SAMPLE_ID_VALUE = (1 << SAMPLE_ID_LENGTH) - 1;

    public static final int GT_OFFSET = 0;
    public static final int PROBE_ID_OFFSET = GT_OFFSET + GT_LENGTH;
    public static final int SAMPLE_ID_OFFSET = PROBE_ID_OFFSET + PROBE_ID_LENGTH;

    /**
     * Builds a record from its individual fields.
     *
     * @param sampleId sample identifier, must be in {@code [0, MAX_SAMPLE_ID_VALUE]}
     * @param probeId  probe identifier, must be in {@code [0, MAX_PROBE_ID_VALUE]}
     * @param genotype genotype call; not validated here, but {@link #encode()} dereferences it
     * @throws GATKException if either id does not fit in its bit field
     */
    public BasicArrayData(int sampleId, int probeId, ArrayGenotype genotype) {
        // check that the sizes fit
        if (sampleId < 0 || sampleId > MAX_SAMPLE_ID_VALUE) {
            throw new GATKException("Attempted sample id of " + sampleId + " which is outside the valid range of 0 to " + MAX_SAMPLE_ID_VALUE);
        }

        if (probeId < 0 || probeId > MAX_PROBE_ID_VALUE) {
            throw new GATKException("Attempted probe id of " + probeId + " which is outside the valid range of 0 to " + MAX_PROBE_ID_VALUE);
        }

        this.sampleId = sampleId;
        this.probeId = probeId;
        this.genotype = genotype;
    }

    /**
     * Unpacks a record from the {@code long} produced by {@link #encode()}.
     *
     * @param bits packed value; fields are read with the statically imported
     *             {@code extractBits(bits, offset, length)} helper from BinaryUtils
     *             (presumably returns {@code length} bits starting at {@code offset} - confirm there)
     */
    public BasicArrayData(long bits) {
        this.genotype = decodeGenotype((int) extractBits(bits, GT_OFFSET, GT_LENGTH));
        this.probeId = (int) extractBits(bits, PROBE_ID_OFFSET, PROBE_ID_LENGTH);
        this.sampleId = (int) extractBits(bits, SAMPLE_ID_OFFSET, SAMPLE_ID_LENGTH);
    }

    /** Maps an encoded integer back to the genotype with that ordinal. */
    private static ArrayGenotype decodeGenotype(int i) {
        return ArrayGenotype.values()[i];
    }

    /** Returns the integer encoding of a genotype, which is its ordinal. */
    private static int encodeGenotype(ArrayGenotype g) {
        return g.ordinal();
    }

    /**
     * Packs genotype, probe id and sample id into a single {@code long} using the layout
     * described on the class.
     *
     * @return the packed value
     */
    public long encode() {
        // Each field is widened to long before shifting so the sample id bits are not
        // lost to 32-bit int shift semantics.
        return (
            ((long) encodeGenotype(this.genotype) << GT_OFFSET) |
            ((long) this.probeId << PROBE_ID_OFFSET) |
            ((long) this.sampleId << SAMPLE_ID_OFFSET)
        );
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ public static SortingCollection<GenericRecord> getAvroProbeIdSortingCollection(o
final static Comparator<GenericRecord> COMPRESSED_PROBE_ID_COMPARATOR = new Comparator<GenericRecord>() {
@Override
public int compare( GenericRecord o1, GenericRecord o2 ) {
final long firstProbeId = RawArrayData.decode((Long) o1.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME)).probeId;
final long secondProbeId = RawArrayData.decode((Long) o2.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME)).probeId;
final long firstProbeId = new BasicArrayData((Long) o1.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)).probeId;
final long secondProbeId = new BasicArrayData((Long) o2.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)).probeId;

return Long.compare(firstProbeId, secondProbeId);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,76 +1,109 @@
package org.broadinstitute.hellbender.tools.variantdb;

import java.math.*;
import static org.broadinstitute.hellbender.tools.variantdb.BinaryUtils.*;

public class RawArrayData {
public static enum ArrayGenotype {
// Order is critical here, the ordinal is the int encoding
AA,AB, BB, NO_CALL
}

// TODO: turn these all into getters/setters with precision checks (e.g. baf)
int probeId;
ArrayGenotype genotype;
Float normx;
Float normy;
Float baf;
Float lrr;

static ArrayGenotype decodeGenotype(int i) {
return ArrayGenotype.values()[i];
public Float normx;
public Float normy;
public Float baf;
public Float lrr;

public RawArrayData(Float normx, Float normy, Float baf, Float lrr) {
this.normx = normx;
this.normy = normy;
this.baf = baf;
this.lrr = lrr;
}

static int encodeGenotype(ArrayGenotype g) {
return g.ordinal();
public static final int NORMX_OFFSET = 0;
public static final int NORMY_OFFSET = 16;
public static final int LRR_OFFSET = 32;
public static final int BAF_OFFSET = 48;

private static final int MIN_16_BIT_VALUE = 0;
private static final int MAX_16_BIT_VALUE = (int) Math.pow(2, 16) - 2; // reserve for null
private static final int NULL_ENCODING = MAX_16_BIT_VALUE + 1;

private static final int MIN_10_BIT_VALUE = 0;
private static final int MAX_10_BIT_VALUE = (int) Math.pow(2, 10) - 2; // reserve for null
private static final int NULL_10_BIT_ENCODING = MAX_10_BIT_VALUE + 1;

/**
 * Encodes a float with three decimal digits of precision into 16 bits by delegating to
 * {@code encode(Float, float)} with an offset of zero. The value is multiplied by 1000
 * and capped; the 16-bit value 0xFFFF is reserved to represent null.
 *
 * @param f value to encode, may be null
 * @return the 16-bit encoding as an int
 */
public static int encode(Float f) {
    final float noOffset = 0;
    return encode(f, noOffset);
}
public static int encode(Float f, float offset) {

// TODO: fix to be 10-bit null encoding also...
if (f == null) return NULL_ENCODING;

public static final int LRR_OFFSET = 0;
public static final float LRR_MIN = -28;
public static final float LRR_MAX = 7;
return
Math.min(
Math.max(
Math.round((f+offset) * 1000.0f),
MIN_16_BIT_VALUE
),
MAX_16_BIT_VALUE
);
}

public static final int BAF_OFFSET = 8;
public static final float BAF_MIN = 0;
public static final float BAF_MAX = 1;
public static int encode10bits(Float f, float offset) {

public static final int NORMX_OFFSET = 16;
public static final float NORMX_MIN = 0;
public static final float NORMX_MAX = 8;
if (f == null) return NULL_10_BIT_ENCODING;

public static final int NORMY_OFFSET = 24;
public static final float NORMY_MIN = 0;
public static final float NORMY_MAX = 8;
return
Math.min(
Math.max(
Math.round((f+offset) * 1000.0f),
MIN_10_BIT_VALUE
),
MAX_10_BIT_VALUE
);
}

public static final int GT_OFFSET = 32;
public static final int PROBE_ID_OFFSET = 42;
/**
 * Decodes a 16-bit encoded value that was stored without an offset.
 *
 * @param i encoded value
 * @return the result of {@code decode(i, 0)}
 */
public static Float decode(long i) {
    final float noOffset = 0;
    return decode(i, noOffset);
}

// GTC Data Ranges: https://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf
public static RawArrayData decode(long bits) {
/**
 * Decodes a 16-bit encoded value: divides by 1000 and then subtracts {@code offset}.
 *
 * @param i      encoded value
 * @param offset amount subtracted after scaling
 * @return the decoded float, or null when {@code i} equals NULL_ENCODING
 */
public static Float decode(long i, float offset) {
    final boolean encodesNull = (i == NULL_ENCODING);
    if (encodesNull) {
        return null;
    }
    final float scaled = ((float) i) / 1000.0f;
    return scaled - offset;
}

RawArrayData data = new RawArrayData();
data.lrr = decodeFrom8Bits((int) extractBits(bits, LRR_OFFSET, 8), LRR_MIN, LRR_MAX);
data.baf = decodeFrom8Bits((int) extractBits(bits, BAF_OFFSET, 8), BAF_MIN, BAF_MAX);
data.normx = decodeFrom8Bits((int) extractBits(bits, NORMX_OFFSET, 8), NORMX_MIN, NORMX_MAX);
data.normy = decodeFrom8Bits((int) extractBits(bits, NORMY_OFFSET, 8), NORMY_MIN, NORMY_MAX);
data.genotype = decodeGenotype((int) extractBits(bits, GT_OFFSET, 2));
data.probeId = (int) extractBits(bits, PROBE_ID_OFFSET, 22);
/**
 * Decodes a 10-bit encoded value by dividing it by 1000.
 *
 * @param i encoded value
 * @return the decoded float, or null when {@code i} equals NULL_10_BIT_ENCODING
 */
public static Float decode10bits(long i) {
    final boolean encodesNull = (i == NULL_10_BIT_ENCODING);
    if (encodesNull) {
        return null;
    }
    final float unscaled = (float) i;
    return unscaled / 1000.0f;
}

return data;
/**
 * Unpacks normx, normy, lrr and baf from their bit-packed form.
 *
 * <p>normx, normy and lrr are each read as 16 bits at their respective offsets;
 * baf is read as 10 bits at BAF_OFFSET. lrr is decoded with an offset of 32.0f.
 * Any field may come back null if it holds the reserved null encoding.
 *
 * @param bits packed value to unpack
 */
public RawArrayData(long bits) {
    // No try/catch here: the decode helpers return boxed Floats that are stored without
    // unboxing, so the former catch-print-rethrow of NullPointerException only added noise.
    this.normx = decode(extractBits(bits, NORMX_OFFSET, 16));
    this.normy = decode(extractBits(bits, NORMY_OFFSET, 16));
    this.lrr = decode(extractBits(bits, LRR_OFFSET, 16), 32.0f);
    this.baf = decode10bits(extractBits(bits, BAF_OFFSET, 10));
}

public long encode() {
long lrrBits = encodeTo8Bits(this.lrr, LRR_MIN, LRR_MAX);
long bafBits = encodeTo8Bits(this.baf, BAF_MIN, BAF_MAX);
long normxBits = encodeTo8Bits(this.normx, NORMX_MIN, NORMX_MAX);
long normyBits = encodeTo8Bits(this.normy, NORMX_MIN, NORMX_MAX);
long gtBits = (long) encodeGenotype(this.genotype);
long normxBits = encode(this.normx);
long normyBits = encode(this.normy);
long lrrBits = encode(this.lrr, 32.0f);
long bafBits = encode10bits(this.baf, 0.0f);

return (
(lrrBits << LRR_OFFSET) |
(bafBits << BAF_OFFSET) |
(normxBits << NORMX_OFFSET) |
(normxBits << NORMX_OFFSET) |
(normyBits << NORMY_OFFSET) |
(gtBits << GT_OFFSET) |
((long) this.probeId << PROBE_ID_OFFSET )
(lrrBits << LRR_OFFSET) |
(bafBits << BAF_OFFSET)
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ public class SchemaUtils {

public static final String SAMPLE_NAME_FIELD_NAME = "sample_name";
public static final String SAMPLE_ID_FIELD_NAME = "sample_id";

public static final String BASIC_ARRAY_DATA_FIELD_NAME = "basic_array_data";
public static final String RAW_ARRAY_DATA_FIELD_NAME = "raw_array_data";

// TODO remove this one - we should not have this ambiguous field
Expand All @@ -35,7 +37,9 @@ public class SchemaUtils {
public static final List<String> COHORT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_NAME_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, "call_GT", "call_GQ", "call_RGQ");
public static final List<String> ARRAY_COHORT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_NAME_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, "call_GT", "call_GQ");

public static final List<String> RAW_ARRAY_COHORT_FIELDS_COMPRESSED = Arrays.asList(SAMPLE_ID_FIELD_NAME, RAW_ARRAY_DATA_FIELD_NAME);
public static final List<String> RAW_ARRAY_COHORT_FIELDS_COMPRESSED =
Arrays.asList(BASIC_ARRAY_DATA_FIELD_NAME, RAW_ARRAY_DATA_FIELD_NAME);

public static final List<String> RAW_ARRAY_COHORT_FIELDS_UNCOMPRESSED =
Arrays.asList(SAMPLE_ID_FIELD_NAME, "probe_id", "GT_encoded","NORMX","NORMY","BAF","LRR");

Expand Down
Loading

0 comments on commit c552d19

Please sign in to comment.