Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

first commit #1

Merged
merged 1 commit into from
Jun 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions conf/log4j-compression.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Logging configuration with compression logging turned up.
# Default: only ERROR and above, sent to the "console" appender defined below.
log4j.rootLogger=ERROR, console

# Per-package overrides: INFO for org.apache.sysds in general, DEBUG for the
# compression runtime package, ERROR only for Spark and Hadoop, and the
# SparkContext logger switched off entirely.
log4j.logger.org.apache.sysds=INFO
log4j.logger.org.apache.sysds.runtime.compress=DEBUG
log4j.logger.org.apache.spark=ERROR
log4j.logger.org.apache.spark.SparkContext=OFF
log4j.logger.org.apache.hadoop=ERROR

# Console appender: writes to System.err.
# Line format: timestamp, level, logger name shortened to its last two
# components, then the message and a line separator.
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ private void classifyPhase() {
private void coCodePhase() {

compressionGroups = CoCoderFactory.findCoCodesByPartitioning(informationExtractor, compressionGroups, k,
costEstimator, compSettings);
costEstimator, compSettings, true);

_stats.estimatedSizeCoCoded = compressionGroups.memoryEstimate();
_stats.estimatedCostCoCoded = costEstimator.getCost(compressionGroups);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,20 @@ public enum PartitionerType {
* @param k The concurrency degree allowed for this operation.
* @param costEstimator The Cost estimator to estimate the cost of the compression
* @param cs The compression settings used in the compression.
* @param detectOneHotEncoding Flag to enable or disable detection of one-hot-encoded (OHE) column groups
* @return The estimated (hopefully) best groups of ColGroups.
*/
public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, CompressedSizeInfo colInfos, int k,
ACostEstimate costEstimator, CompressionSettings cs) {
ACostEstimate costEstimator, CompressionSettings cs, boolean detectOneHotEncoding) {

// Use column group partitioner to create partitions of columns
AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs);

// Find out if any of the groups are empty.
final boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos);
if (detectOneHotEncoding) {
LOG.info("Flag Correct");
}

// if there are no empty or const columns then try cocode algorithms for all columns
if(!containsEmptyConstOrIncompressable)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public abstract class AColGroup implements Serializable {

/** Public super types of compression ColGroups supported */
public static enum CompressionType {
UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional;
UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, OHE;

public boolean isDense() {
return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;

import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
Expand All @@ -40,6 +41,7 @@
import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary;
import org.apache.sysds.runtime.compress.colgroup.functional.LinearRegression;
import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
Expand Down Expand Up @@ -282,6 +284,10 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) {
return compressSDCSingleColDirectBlock(colIndexes, cg.getNumVals());
}

else if(ct == CompressionType.OHE) {
return compressOHE(colIndexes,cg.getNumVals());
}

final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, in, cg.getNumVals(), cs);
if(ubm == null) // no values ... therefore empty
return new ColGroupEmpty(colIndexes);
Expand Down Expand Up @@ -312,6 +318,25 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) {
}
}

/**
 * Compress the given columns as a one-hot-encoded (OHE) group: a DDC column group backed by an
 * {@link IdentityDictionary}, where each row is mapped to the position (within colIndexes) of the column that
 * holds its single 1.
 *
 * Every row is verified to contain exactly one 1 and otherwise only 0 in the selected columns. Without this
 * check a row with no 1, with several 1s, or with any other value would be encoded silently and decompress to
 * different data than the input.
 *
 * @param colIndexes The columns to compress together
 * @param numVals    The number of distinct values, used to size both the row mapping and the identity dictionary.
 *                   NOTE(review): presumably equal to colIndexes.size() for a true OHE group - confirm with the
 *                   caller.
 * @return A DDC column group over colIndexes using an identity dictionary
 * @throws NotImplementedException  if the compression settings mark the input as transposed
 * @throws IllegalArgumentException if some row of the selected columns is not one-hot encoded
 */
private AColGroup compressOHE(IColIndex colIndexes, int numVals) {
	if(cs.transposed) {
		throw new NotImplementedException("Not implemented");
	}

	// Loop invariants hoisted out of the row/column scan.
	final int nRow = in.getNumRows();
	final int nCol = colIndexes.size();

	AMapToData data = MapToFactory.create(nRow, numVals);
	for(int r = 0; r < nRow; r++) {
		// Position of the hot column in this row, -1 while none has been seen.
		int hot = -1;
		for(int c = 0; c < nCol; c++) {
			final double v = in.get(r, colIndexes.get(c));
			if(v == 1) {
				if(hot >= 0) {
					throw new IllegalArgumentException("Input is not one-hot encoded: row " + r
						+ " contains more than one 1 (group positions " + hot + " and " + c + ")");
				}
				hot = c;
			}
			else if(v != 0) {
				// Also rejects NaN, since NaN != 0 evaluates to true.
				throw new IllegalArgumentException("Input is not one-hot encoded: row " + r + " contains value "
					+ v + " at group position " + c);
			}
		}
		if(hot < 0) {
			throw new IllegalArgumentException("Input is not one-hot encoded: row " + r + " contains no 1");
		}
		data.set(r, hot);
	}
	return ColGroupDDC.create(colIndexes, new IdentityDictionary(numVals), data, null);
}

private AColGroup compressSDCSingleColDirectBlock(IColIndex colIndexes, int nVal) {
final DoubleCountHashMap cMap = new DoubleCountHashMap(nVal);
final int col = colIndexes.get(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public void testCostEstimate() {
double actualCostIndividual = ce.getCost(individualCols, nRows);

// cocode
CompressedSizeInfo cocodeGroups = CoCoderFactory.findCoCodesByPartitioning(ie, individualGroups, k, ce, cs);
CompressedSizeInfo cocodeGroups = CoCoderFactory.findCoCodesByPartitioning(ie, individualGroups, k, ce, cs, true);
double estimatedCostCoCode = ce.getCost(cocodeGroups);
List<AColGroup> cocodeCols = ColGroupFactory.compressColGroups(mb, cocodeGroups, cs, k);
double actualCostCoCode = ce.getCost(cocodeCols, nRows);
Expand Down