Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
smyomous committed Jun 8, 2024
1 parent b1cb505 commit d621272
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 4 deletions.
12 changes: 12 additions & 0 deletions conf/log4j-compression.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
log4j.rootLogger=ERROR, console

log4j.logger.org.apache.sysds=INFO
log4j.logger.org.apache.sysds.runtime.compress=DEBUG
log4j.logger.org.apache.spark=ERROR
log4j.logger.org.apache.spark.SparkContext=OFF
log4j.logger.org.apache.hadoop=ERROR

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ private void classifyPhase() {
private void coCodePhase() {

compressionGroups = CoCoderFactory.findCoCodesByPartitioning(informationExtractor, compressionGroups, k,
costEstimator, compSettings);
costEstimator, compSettings, true);

_stats.estimatedSizeCoCoded = compressionGroups.memoryEstimate();
_stats.estimatedCostCoCoded = costEstimator.getCost(compressionGroups);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,20 @@ public enum PartitionerType {
* @param k The concurrency degree allowed for this operation.
* @param costEstimator The Cost estimator to estimate the cost of the compression
* @param cs The compression settings used in the compression.
* @param detectOneHotEncoding Flag to enable or disable detection of one-hot-encoded (OHE) column groups
* @return The estimated (hopefully) best groups of ColGroups.
*/
public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, CompressedSizeInfo colInfos, int k,
ACostEstimate costEstimator, CompressionSettings cs) {
ACostEstimate costEstimator, CompressionSettings cs, boolean detectOneHotEncoding) {

// Use column group partitioner to create partitions of columns
AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs);

// Find out if any of the groups are empty.
final boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos);
if (detectOneHotEncoding) {
LOG.info("Flag Correct");
}

// if there are no empty or const columns then try cocode algorithms for all columns
if(!containsEmptyConstOrIncompressable)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public abstract class AColGroup implements Serializable {

/** Public super types of compression ColGroups supported */
public static enum CompressionType {
UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional;
UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, OHE;

public boolean isDense() {
return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;

import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
Expand All @@ -40,6 +41,7 @@
import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary;
import org.apache.sysds.runtime.compress.colgroup.functional.LinearRegression;
import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
Expand Down Expand Up @@ -282,6 +284,10 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) {
return compressSDCSingleColDirectBlock(colIndexes, cg.getNumVals());
}

else if(ct == CompressionType.OHE) {
return compressOHE(colIndexes,cg.getNumVals());
}

final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, in, cg.getNumVals(), cs);
if(ubm == null) // no values ... therefore empty
return new ColGroupEmpty(colIndexes);
Expand Down Expand Up @@ -312,6 +318,25 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) {
}
}

/**
 * Compress the given columns as a one-hot-encoded (OHE) group.
 *
 * The result is a DDC column group backed by an {@link IdentityDictionary} of size numVals, where the map stores
 * for every row the position (within colIndexes) of the single column that holds the value 1.
 *
 * Every row is validated to contain exactly one non-zero cell and that cell must be 1. Without this validation a
 * row with no 1 would keep the map's initial value and a row with extra non-zero cells would lose them, both of
 * which silently corrupt the compressed data.
 *
 * NOTE(review): the identity dictionary is sized by numVals while the map is filled with column positions; this
 * presumably requires numVals == colIndexes.size() - confirm against the caller.
 *
 * @param colIndexes The column indexes to compress together
 * @param numVals    The number of distinct values, used to size the map and the identity dictionary
 * @return A DDC column group with an identity dictionary
 * @throws NotImplementedException  If the compression settings specify a transposed input
 * @throws IllegalArgumentException If any row of the selected columns is not one-hot encoded
 */
private AColGroup compressOHE(IColIndex colIndexes, int numVals) {
	if(cs.transposed)
		throw new NotImplementedException("Not implemented");

	final int nRow = in.getNumRows();
	final int nCol = colIndexes.size();
	final AMapToData data = MapToFactory.create(nRow, numVals);

	for(int r = 0; r < nRow; r++) {
		// position of the hot column in this row, -1 while none has been found
		int hot = -1;
		for(int c = 0; c < nCol; c++) {
			final double v = in.get(r, colIndexes.get(c));
			if(v == 0)
				continue;
			// any non-zero other than 1 (including NaN), or a second 1, breaks the one-hot property
			if(v != 1 || hot >= 0)
				throw new IllegalArgumentException("Input is not one-hot encoded: row " + r
					+ " has an unexpected non-zero value " + v + " in column " + colIndexes.get(c));
			hot = c;
		}
		if(hot < 0)
			throw new IllegalArgumentException(
				"Input is not one-hot encoded: row " + r + " contains no 1 in the selected columns");
		data.set(r, hot);
	}

	return ColGroupDDC.create(colIndexes, new IdentityDictionary(numVals), data, null);
}

private AColGroup compressSDCSingleColDirectBlock(IColIndex colIndexes, int nVal) {
final DoubleCountHashMap cMap = new DoubleCountHashMap(nVal);
final int col = colIndexes.get(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public void testCostEstimate() {
double actualCostIndividual = ce.getCost(individualCols, nRows);

// cocode
CompressedSizeInfo cocodeGroups = CoCoderFactory.findCoCodesByPartitioning(ie, individualGroups, k, ce, cs);
CompressedSizeInfo cocodeGroups = CoCoderFactory.findCoCodesByPartitioning(ie, individualGroups, k, ce, cs, true);
double estimatedCostCoCode = ce.getCost(cocodeGroups);
List<AColGroup> cocodeCols = ColGroupFactory.compressColGroups(mb, cocodeGroups, cs, k);
double actualCostCoCode = ce.getCost(cocodeCols, nRows);
Expand Down

0 comments on commit d621272

Please sign in to comment.