new bit-compression #6691

Merged · merged 1 commit · Jul 2, 2020
@@ -83,6 +83,12 @@ public enum QueryMode {
)
private String cohortTable = null;

@Argument(
fullName = "use-compressed-data",
doc = "If true, use bit-packed fields for data",
optional = true)
private boolean useCompressedData = false;

@Argument(
fullName = "print-debug-information",
doc = "If true, print extra debugging output",
@@ -171,7 +177,7 @@ protected void onStartup() {
probeIdMap,
cohortTable,
localSortMaxRecordsInRam,
false,
useCompressedData,
printDebugInformation,
progressMeter);
vcfWriter.writeHeader(header);
@@ -11,7 +11,7 @@
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.ProgressMeter;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.tools.variantdb.RawArrayData.ArrayGenotype;
import org.broadinstitute.hellbender.tools.variantdb.BasicArrayData.ArrayGenotype;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.utils.SimpleInterval;
@@ -122,8 +122,8 @@ private void createVariantsFromUngroupedTableResult(final GATKAvroReader avroRea
for ( final GenericRecord sortedRow : sortingCollection ) {
long probeId;
if (useCompressedData) {
final long rawData = (Long) sortedRow.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME);
RawArrayData data = RawArrayData.decode(rawData);
final long bits = (Long) sortedRow.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME);
BasicArrayData data = new BasicArrayData(bits);
probeId = data.probeId;
} else {
probeId = (Long) sortedRow.get("probe_id");
@@ -162,7 +177,17 @@ private void processSampleRecordsForLocation(final long probeId, final Iterable<
int numRecordsAtPosition = 0;

for ( final GenericRecord sampleRecord : sampleRecordsAtPosition ) {
final long sampleId = (Long) sampleRecord.get(SchemaUtils.SAMPLE_ID_FIELD_NAME);
final long sampleId;
if (useCompressedData) {
final long bits = (Long) sampleRecord.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME);
BasicArrayData data = new BasicArrayData(bits);
sampleId = data.sampleId;
} else {
sampleId = (Long) sampleRecord.get(SchemaUtils.SAMPLE_ID_FIELD_NAME);

// TODO: hack to test roundtrip

}

// TODO: handle missing values
String sampleName = sampleIdMap.get((int) sampleId);
@@ -275,19 +285,22 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob
List<Allele> genotypeAlleles = new ArrayList<Allele>();

if (this.useCompressedData) {
final RawArrayData data = RawArrayData.decode((Long) sampleRecord.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME));
normx = data.normx;
normy = data.normy;
lrr = data.lrr;
baf = data.baf;
final BasicArrayData basicData = new BasicArrayData((Long) sampleRecord.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME));
Object rd = sampleRecord.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME);

final RawArrayData rawData = new RawArrayData((Long) rd);
normx = rawData.normx;
normy = rawData.normy;
lrr = rawData.lrr;
baf = rawData.baf;

if (data.genotype == ArrayGenotype.AA) {
if (basicData.genotype == ArrayGenotype.AA) {
genotypeAlleles.add(alleleA);
genotypeAlleles.add(alleleA);
} else if (data.genotype == ArrayGenotype.AB) {
} else if (basicData.genotype == ArrayGenotype.AB) {
genotypeAlleles.add(alleleA);
genotypeAlleles.add(alleleB);
} else if (data.genotype == ArrayGenotype.BB) {
} else if (basicData.genotype == ArrayGenotype.BB) {
genotypeAlleles.add(alleleB);
genotypeAlleles.add(alleleB);
} else {
@@ -326,16 +339,11 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob
lrr = getNullableFloatFromDouble(sampleRecord.get("LRR"));

// Hack to pack and unpack data
RawArrayData d = new RawArrayData();
d.probeId = (int) probeInfo.probeId;
d.genotype = agt;
d.baf = baf;
d.lrr = lrr;
d.normx = normx;
d.normy = normy;
BasicArrayData b = new BasicArrayData(0, (int) probeInfo.probeId, agt);
RawArrayData d = new RawArrayData(normx, normy, baf, lrr);

long bits = d.encode();
RawArrayData d2 = RawArrayData.decode(bits);
RawArrayData d2 = new RawArrayData(bits);
normx = d2.normx;
normy = d2.normy;
baf = d2.baf;
@@ -0,0 +1,64 @@
package org.broadinstitute.hellbender.tools.variantdb;

import static org.broadinstitute.hellbender.tools.variantdb.BinaryUtils.*;

import org.broadinstitute.hellbender.exceptions.GATKException;

public class BasicArrayData {
public static enum ArrayGenotype {
// Order is critical here, the ordinal is the int encoding
AA,AB, BB, NO_CALL
}

public int sampleId;
public int probeId;
public ArrayGenotype genotype;

public static final int GT_LENGTH = 2;
public static final int PROBE_ID_LENGTH = 30;
public static final int MAX_PROBE_ID_VALUE = (int) Math.pow(2, PROBE_ID_LENGTH) - 1;

public static final int SAMPLE_ID_LENGTH = 30;
public static final int MAX_SAMPLE_ID_VALUE = (int) Math.pow(2, SAMPLE_ID_LENGTH) - 1;

public static final int GT_OFFSET = 0;
public static final int PROBE_ID_OFFSET = GT_OFFSET + GT_LENGTH;
public static final int SAMPLE_ID_OFFSET = PROBE_ID_OFFSET + PROBE_ID_LENGTH;

public BasicArrayData(int sampleId, int probeId, ArrayGenotype genotype) {
// check that the sizes fit
if (sampleId < 0 || sampleId > MAX_SAMPLE_ID_VALUE) {
throw new GATKException("Attempted sample id of " + sampleId + " which is great than the maximum of " + MAX_SAMPLE_ID_VALUE);
}

if (probeId < 0 || probeId > MAX_PROBE_ID_VALUE) {
throw new GATKException("Attempted sample id of " + probeId + " which is great than the maximum of " + MAX_PROBE_ID_VALUE);
}

this.sampleId = sampleId;
this.probeId = probeId;
this.genotype = genotype;
}

public BasicArrayData(long bits) {
this.genotype = decodeGenotype((int) extractBits(bits, GT_OFFSET, GT_LENGTH));
this.probeId = (int) extractBits(bits, PROBE_ID_OFFSET, PROBE_ID_LENGTH);
this.sampleId = (int) extractBits(bits, SAMPLE_ID_OFFSET, SAMPLE_ID_LENGTH);
}

private ArrayGenotype decodeGenotype(int i) {
return ArrayGenotype.values()[i];
}

private int encodeGenotype(ArrayGenotype g) {
return g.ordinal();
}

public long encode() {
return (
((long) encodeGenotype(this.genotype) << GT_OFFSET) |
((long) this.probeId << PROBE_ID_OFFSET ) |
((long) this.sampleId << SAMPLE_ID_OFFSET )
);
}
}
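
For reference, here is a minimal, self-contained sketch of the round trip implied by the constants above: genotype in bits 0–1, probe id in bits 2–31, sample id in bits 32–61, so all three fields fit in the low 62 bits of one long. The local `extractBits` helper is a stand-in assumed to have plain shift-and-mask semantics; it is not the actual `BinaryUtils` implementation.

```java
public class BasicArrayDataRoundTripSketch {
    // Stand-in for BinaryUtils.extractBits: assumed shift-and-mask semantics.
    static long extractBits(long bits, int offset, int length) {
        return (bits >>> offset) & ((1L << length) - 1);
    }

    public static void main(String[] args) {
        int sampleId = 12345;    // fits in 30 bits (max 2^30 - 1)
        int probeId  = 678_901;  // fits in 30 bits
        int genotype = 1;        // ordinal of ArrayGenotype.AB

        // Pack: genotype at bits 0-1, probe id at bits 2-31, sample id at bits 32-61.
        long bits = ((long) genotype)
                  | ((long) probeId  << 2)
                  | ((long) sampleId << 32);

        // Unpack and verify the round trip.
        int gt  = (int) extractBits(bits, 0, 2);
        int pid = (int) extractBits(bits, 2, 30);
        int sid = (int) extractBits(bits, 32, 30);

        System.out.printf("gt=%d probeId=%d sampleId=%d%n", gt, pid, sid);
        // Expected: gt=1 probeId=678901 sampleId=12345
    }
}
```

Packing all three into one value is what allows a single basic_array_data column to replace the separate sample_id, probe_id and GT_encoded columns in the compressed field list.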
@@ -110,8 +110,8 @@ public static SortingCollection<GenericRecord> getAvroProbeIdSortingCollection(o
final static Comparator<GenericRecord> COMPRESSED_PROBE_ID_COMPARATOR = new Comparator<GenericRecord>() {
@Override
public int compare( GenericRecord o1, GenericRecord o2 ) {
final long firstProbeId = RawArrayData.decode((Long) o1.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME)).probeId;
final long secondProbeId = RawArrayData.decode((Long) o2.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME)).probeId;
final long firstProbeId = new BasicArrayData((Long) o1.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)).probeId;
final long secondProbeId = new BasicArrayData((Long) o2.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)).probeId;

return Long.compare(firstProbeId, secondProbeId);
}
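
Since ordering only depends on the probe id packed into basic_array_data, the comparator just decodes that one field. Below is a minimal sketch of the same idea on plain packed longs, without the Avro GenericRecord plumbing; the class and helper names are illustrative, not part of the PR.

```java
import java.util.Arrays;
import java.util.Comparator;

public class PackedProbeIdSortSketch {
    // Probe id occupies 30 bits starting at bit 2 of the packed basic_array_data value.
    static long probeIdOf(long basicArrayData) {
        return (basicArrayData >>> 2) & ((1L << 30) - 1);
    }

    public static void main(String[] args) {
        // Three packed values with probe ids 7, 3 and 5 (sample id and genotype set to 0).
        Long[] packed = { 7L << 2, 3L << 2, 5L << 2 };

        Arrays.sort(packed, Comparator.comparingLong(PackedProbeIdSortSketch::probeIdOf));

        for (long bits : packed) {
            System.out.println(probeIdOf(bits));  // prints 3, 5, 7
        }
    }
}
```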
@@ -1,76 +1,109 @@
package org.broadinstitute.hellbender.tools.variantdb;

import java.math.*;
import static org.broadinstitute.hellbender.tools.variantdb.BinaryUtils.*;

public class RawArrayData {
public static enum ArrayGenotype {
// Order is critical here, the ordinal is the int encoding
AA,AB, BB, NO_CALL
}

// TODO: turn these all into getters/setters with precision checks (e.g. baf)
int probeId;
ArrayGenotype genotype;
Float normx;
Float normy;
Float baf;
Float lrr;

static ArrayGenotype decodeGenotype(int i) {
return ArrayGenotype.values()[i];
public Float normx;
public Float normy;
public Float baf;
public Float lrr;

public RawArrayData(Float normx, Float normy, Float baf, Float lrr) {
this.normx = normx;
this.normy = normy;
this.baf = baf;
this.lrr = lrr;
}

static int encodeGenotype(ArrayGenotype g) {
return g.ordinal();
public static final int NORMX_OFFSET = 0;
public static final int NORMY_OFFSET = 16;
public static final int LRR_OFFSET = 32;
public static final int BAF_OFFSET = 48;

private static final int MIN_16_BIT_VALUE = 0;
private static final int MAX_16_BIT_VALUE = (int) Math.pow(2, 16) - 2; // reserve for null
private static final int NULL_ENCODING = MAX_16_BIT_VALUE + 1;

private static final int MIN_10_BIT_VALUE = 0;
private static final int MAX_10_BIT_VALUE = (int) Math.pow(2, 10) - 2; // reserve for null
private static final int NULL_10_BIT_ENCODING = MAX_10_BIT_VALUE + 1;

// store a float with 3-decimal digits in 16 bits by
// multiplying by 1000 and capping values, reserving
// 0xFFFF to represent null
public static int encode(Float f) {
return encode(f,0);
}
public static int encode(Float f, float offset) {

// TODO: fix to be 10-bit null encoding also...
if (f == null) return NULL_ENCODING;

public static final int LRR_OFFSET = 0;
public static final float LRR_MIN = -28;
public static final float LRR_MAX = 7;
return
Math.min(
Math.max(
Math.round((f+offset) * 1000.0f),
MIN_16_BIT_VALUE
),
MAX_16_BIT_VALUE
);
}

public static final int BAF_OFFSET = 8;
public static final float BAF_MIN = 0;
public static final float BAF_MAX = 1;
public static int encode10bits(Float f, float offset) {

public static final int NORMX_OFFSET = 16;
public static final float NORMX_MIN = 0;
public static final float NORMX_MAX = 8;
if (f == null) return NULL_10_BIT_ENCODING;

public static final int NORMY_OFFSET = 24;
public static final float NORMY_MIN = 0;
public static final float NORMY_MAX = 8;
return
Math.min(
Math.max(
Math.round((f+offset) * 1000.0f),
MIN_10_BIT_VALUE
),
MAX_10_BIT_VALUE
);
}

public static final int GT_OFFSET = 32;
public static final int PROBE_ID_OFFSET = 42;
public static Float decode(long i) {
return decode(i, 0);
}

// GTC Data Ranges: https://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf
public static RawArrayData decode(long bits) {
public static Float decode(long i, float offset) {
if (i == NULL_ENCODING) return null;
return (
(float) i) / 1000.0f - offset;
}

RawArrayData data = new RawArrayData();
data.lrr = decodeFrom8Bits((int) extractBits(bits, LRR_OFFSET, 8), LRR_MIN, LRR_MAX);
data.baf = decodeFrom8Bits((int) extractBits(bits, BAF_OFFSET, 8), BAF_MIN, BAF_MAX);
data.normx = decodeFrom8Bits((int) extractBits(bits, NORMX_OFFSET, 8), NORMX_MIN, NORMX_MAX);
data.normy = decodeFrom8Bits((int) extractBits(bits, NORMY_OFFSET, 8), NORMY_MIN, NORMY_MAX);
data.genotype = decodeGenotype((int) extractBits(bits, GT_OFFSET, 2));
data.probeId = (int) extractBits(bits, PROBE_ID_OFFSET, 22);
public static Float decode10bits(long i) {
if (i == NULL_10_BIT_ENCODING) return null;
return (
(float) i) / 1000.0f;
}

return data;
public RawArrayData(long bits) {
try {
this.normx = decode(extractBits(bits, NORMX_OFFSET, 16));
this.normy = decode(extractBits(bits, NORMY_OFFSET, 16));
this.lrr = decode(extractBits(bits, LRR_OFFSET, 16), 32.0f);
this.baf = decode10bits(extractBits(bits, BAF_OFFSET, 10));
} catch (NullPointerException npe) {
npe.printStackTrace();
throw npe;
}
}

public long encode() {
long lrrBits = encodeTo8Bits(this.lrr, LRR_MIN, LRR_MAX);
long bafBits = encodeTo8Bits(this.baf, BAF_MIN, BAF_MAX);
long normxBits = encodeTo8Bits(this.normx, NORMX_MIN, NORMX_MAX);
long normyBits = encodeTo8Bits(this.normy, NORMX_MIN, NORMX_MAX);
long gtBits = (long) encodeGenotype(this.genotype);
long normxBits = encode(this.normx);
long normyBits = encode(this.normy);
long lrrBits = encode(this.lrr, 32.0f);
long bafBits = encode10bits(this.baf, 0.0f);

return (
(lrrBits << LRR_OFFSET) |
(bafBits << BAF_OFFSET) |
(normxBits << NORMX_OFFSET) |
(normxBits << NORMX_OFFSET) |
(normyBits << NORMY_OFFSET) |
(gtBits << GT_OFFSET) |
((long) this.probeId << PROBE_ID_OFFSET )
(lrrBits << LRR_OFFSET) |
(bafBits << BAF_OFFSET)
);
}
}
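
A worked sketch of the fixed-point scheme above: each value is stored as round(value * 1000) in a 16-bit field (10 bits for BAF), the top code of each field is reserved for null, and LRR is packed with a +32 offset, presumably so that its negative values land in the unsigned range. The class and method names below are illustrative, not part of the PR.

```java
public class FixedPointSketch {
    // 16-bit fields: 0..65534 carry data, 65535 is the reserved null code.
    static final int MAX_16_BIT_VALUE = (1 << 16) - 2;            // 65534
    static final int NULL_16_BIT_ENCODING = MAX_16_BIT_VALUE + 1; // 65535
    // BAF (range 0..1) uses the same scheme in a 10-bit field: 0..1022 data, 1023 null.

    // 3-decimal fixed point: scale by 1000, clamp into range, reserve the top code for null.
    static int encode16(Float f, float offset) {
        if (f == null) return NULL_16_BIT_ENCODING;
        return Math.min(Math.max(Math.round((f + offset) * 1000.0f), 0), MAX_16_BIT_VALUE);
    }

    static Float decode16(long i, float offset) {
        if (i == NULL_16_BIT_ENCODING) return null;
        return ((float) i) / 1000.0f - offset;
    }

    public static void main(String[] args) {
        // NORMX-style value, no offset: 1.234 -> 1234 -> 1.234
        System.out.println(decode16(encode16(1.234f, 0.0f), 0.0f));

        // LRR-style value with the +32 offset so negatives pack as unsigned:
        // -3.5 -> round(28500.0) = 28500 -> 28.5 - 32 = -3.5
        System.out.println(decode16(encode16(-3.5f, 32.0f), 32.0f));

        // Out-of-range values clamp rather than wrap: 70.0 -> 65534 -> 65.534
        System.out.println(decode16(encode16(70.0f, 0.0f), 0.0f));

        // null round-trips through the reserved code
        System.out.println(decode16(encode16(null, 0.0f), 0.0f));
    }
}
```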
@@ -13,6 +13,8 @@ public class SchemaUtils {

public static final String SAMPLE_NAME_FIELD_NAME = "sample_name";
public static final String SAMPLE_ID_FIELD_NAME = "sample_id";

public static final String BASIC_ARRAY_DATA_FIELD_NAME = "basic_array_data";
public static final String RAW_ARRAY_DATA_FIELD_NAME = "raw_array_data";

// TODO remove this one - we should not have this ambiguous field
@@ -35,7 +37,9 @@ public class SchemaUtils {
public static final List<String> COHORT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_NAME_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, "call_GT", "call_GQ", "call_RGQ");
public static final List<String> ARRAY_COHORT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_NAME_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, "call_GT", "call_GQ");

public static final List<String> RAW_ARRAY_COHORT_FIELDS_COMPRESSED = Arrays.asList(SAMPLE_ID_FIELD_NAME, RAW_ARRAY_DATA_FIELD_NAME);
public static final List<String> RAW_ARRAY_COHORT_FIELDS_COMPRESSED =
Arrays.asList(BASIC_ARRAY_DATA_FIELD_NAME, RAW_ARRAY_DATA_FIELD_NAME);

public static final List<String> RAW_ARRAY_COHORT_FIELDS_UNCOMPRESSED =
Arrays.asList(SAMPLE_ID_FIELD_NAME, "probe_id", "GT_encoded","NORMX","NORMY","BAF","LRR");
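
Presumably the use-compressed-data flag chooses between these two field lists when the query is built, with the compressed pair then decoded via BasicArrayData and RawArrayData as shown earlier. A minimal sketch of that selection; selectFields and the class below are illustrative stand-ins, not actual GATK methods.

```java
import java.util.Arrays;
import java.util.List;

public class FieldSelectionSketch {
    static final String BASIC_ARRAY_DATA_FIELD_NAME = "basic_array_data";
    static final String RAW_ARRAY_DATA_FIELD_NAME = "raw_array_data";
    static final String SAMPLE_ID_FIELD_NAME = "sample_id";

    // Illustrative helper mirroring SchemaUtils: pick the column set for the query.
    static List<String> selectFields(boolean useCompressedData) {
        if (useCompressedData) {
            // Two packed longs carry sample id, probe id, genotype, NORMX, NORMY, BAF and LRR.
            return Arrays.asList(BASIC_ARRAY_DATA_FIELD_NAME, RAW_ARRAY_DATA_FIELD_NAME);
        }
        // Uncompressed rows keep one column per value.
        return Arrays.asList(SAMPLE_ID_FIELD_NAME, "probe_id", "GT_encoded",
                             "NORMX", "NORMY", "BAF", "LRR");
    }

    public static void main(String[] args) {
        System.out.println(selectFields(true));
        System.out.println(selectFields(false));
    }
}
```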
