diff --git a/conf/log4j-compression.properties b/conf/log4j-compression.properties new file mode 100644 index 00000000000..2f251773da4 --- /dev/null +++ b/conf/log4j-compression.properties @@ -0,0 +1,12 @@ +log4j.rootLogger=ERROR, console + +log4j.logger.org.apache.sysds=INFO +log4j.logger.org.apache.sysds.runtime.compress=DEBUG +log4j.logger.org.apache.spark=ERROR +log4j.logger.org.apache.spark.SparkContext=OFF +log4j.logger.org.apache.hadoop=ERROR + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java index b31ef4afddf..77c1b3ed239 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java @@ -359,7 +359,7 @@ private void classifyPhase() { private void coCodePhase() { compressionGroups = CoCoderFactory.findCoCodesByPartitioning(informationExtractor, compressionGroups, k, - costEstimator, compSettings); + costEstimator, compSettings, true); _stats.estimatedSizeCoCoded = compressionGroups.memoryEstimate(); _stats.estimatedCostCoCoded = costEstimator.getCost(compressionGroups); diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index 6c560fb9792..0a6a5d35ca3 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -54,16 +54,20 @@ public enum PartitionerType { * @param k The concurrency degree allowed for this operation. * @param costEstimator The Cost estimator to estimate the cost of the compression * @param cs The compression settings used in the compression. + * @param detectOneHotEncoding Flag to Enable/Disable OHE Detection * @return The estimated (hopefully) best groups of ColGroups. */ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, CompressedSizeInfo colInfos, int k, - ACostEstimate costEstimator, CompressionSettings cs) { + ACostEstimate costEstimator, CompressionSettings cs, boolean detectOneHotEncoding) { // Use column group partitioner to create partitions of columns AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs); // Find out if any of the groups are empty. final boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos); + if (detectOneHotEncoding) { + LOG.info("Flag Correct"); + } // if there are no empty or const columns then try cocode algorithms for all columns if(!containsEmptyConstOrIncompressable) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index a4030d95612..a607424f4c2 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -57,7 +57,7 @@ public abstract class AColGroup implements Serializable { /** Public super types of compression ColGroups supported */ public static enum CompressionType { - UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional; + UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, OHE; public boolean isDense() { return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index f382f7b2f71..4d0aeb4dc02 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -29,6 +29,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; +import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; @@ -40,6 +41,7 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory; import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary; +import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary; import org.apache.sysds.runtime.compress.colgroup.functional.LinearRegression; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; @@ -282,6 +284,10 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) { return compressSDCSingleColDirectBlock(colIndexes, cg.getNumVals()); } + else if(ct == CompressionType.OHE) { + return compressOHE(colIndexes,cg.getNumVals()); + } + final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, in, cg.getNumVals(), cs); if(ubm == null) // no values ... therefore empty return new ColGroupEmpty(colIndexes); @@ -312,6 +318,25 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) { } } + private AColGroup compressOHE(IColIndex colIndexes, int numVals) { + //There are some edge cases, can be optimized further + // You have to make sure that it is actually OHE + // Make an evil case that the input is OHE except maybe the final row + if(cs.transposed){ + throw new NotImplementedException("Not implemented"); + } + AMapToData data = MapToFactory.create(in.getNumRows(), numVals); + for(int r=0;r cocodeCols = ColGroupFactory.compressColGroups(mb, cocodeGroups, cs, k); double actualCostCoCode = ce.getCost(cocodeCols, nRows);