From d621272ce2d1e2205cec7dea6d174bd9bf53e33a Mon Sep 17 00:00:00 2001 From: Youssef Date: Sat, 8 Jun 2024 12:36:20 +0200 Subject: [PATCH 01/11] first commit --- conf/log4j-compression.properties | 12 +++++++++ .../CompressedMatrixBlockFactory.java | 2 +- .../compress/cocode/CoCoderFactory.java | 6 ++++- .../runtime/compress/colgroup/AColGroup.java | 2 +- .../compress/colgroup/ColGroupFactory.java | 25 +++++++++++++++++++ .../component/compress/cost/ACostTest.java | 2 +- 6 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 conf/log4j-compression.properties diff --git a/conf/log4j-compression.properties b/conf/log4j-compression.properties new file mode 100644 index 00000000000..2f251773da4 --- /dev/null +++ b/conf/log4j-compression.properties @@ -0,0 +1,12 @@ +log4j.rootLogger=ERROR, console + +log4j.logger.org.apache.sysds=INFO +log4j.logger.org.apache.sysds.runtime.compress=DEBUG +log4j.logger.org.apache.spark=ERROR +log4j.logger.org.apache.spark.SparkContext=OFF +log4j.logger.org.apache.hadoop=ERROR + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java index b31ef4afddf..77c1b3ed239 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java @@ -359,7 +359,7 @@ private void classifyPhase() { private void coCodePhase() { compressionGroups = CoCoderFactory.findCoCodesByPartitioning(informationExtractor, compressionGroups, k, - costEstimator, compSettings); + costEstimator, compSettings, true); _stats.estimatedSizeCoCoded = compressionGroups.memoryEstimate(); _stats.estimatedCostCoCoded = costEstimator.getCost(compressionGroups); diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index 6c560fb9792..0a6a5d35ca3 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -54,16 +54,20 @@ public enum PartitionerType { * @param k The concurrency degree allowed for this operation. * @param costEstimator The Cost estimator to estimate the cost of the compression * @param cs The compression settings used in the compression. + * @param detectOneHotEncoding Flag to Enable/Disable OHE Detection * @return The estimated (hopefully) best groups of ColGroups. */ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, CompressedSizeInfo colInfos, int k, - ACostEstimate costEstimator, CompressionSettings cs) { + ACostEstimate costEstimator, CompressionSettings cs, boolean detectOneHotEncoding) { // Use column group partitioner to create partitions of columns AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs); // Find out if any of the groups are empty. 
final boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos); + if (detectOneHotEncoding) { + LOG.info("Flag Correct"); + } // if there are no empty or const columns then try cocode algorithms for all columns if(!containsEmptyConstOrIncompressable) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index a4030d95612..a607424f4c2 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -57,7 +57,7 @@ public abstract class AColGroup implements Serializable { /** Public super types of compression ColGroups supported */ public static enum CompressionType { - UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional; + UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, OHE; public boolean isDense() { return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index f382f7b2f71..4d0aeb4dc02 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -29,6 +29,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; +import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; @@ -40,6 +41,7 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory; import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary; +import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary; import org.apache.sysds.runtime.compress.colgroup.functional.LinearRegression; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; @@ -282,6 +284,10 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) { return compressSDCSingleColDirectBlock(colIndexes, cg.getNumVals()); } + else if(ct == CompressionType.OHE) { + return compressOHE(colIndexes,cg.getNumVals()); + } + final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, in, cg.getNumVals(), cs); if(ubm == null) // no values ... 
therefore empty return new ColGroupEmpty(colIndexes); @@ -312,6 +318,25 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) { } } + private AColGroup compressOHE(IColIndex colIndexes, int numVals) { + //There are some edge cases, can be optimized further + // You have to make sure that it is actually OHE + // Make an evil case that the input is OHE except maybe the final row + if(cs.transposed){ + throw new NotImplementedException("Not implemented"); + } + AMapToData data = MapToFactory.create(in.getNumRows(), numVals); + for(int r=0;r cocodeCols = ColGroupFactory.compressColGroups(mb, cocodeGroups, cs, k); double actualCostCoCode = ce.getCost(cocodeCols, nRows); From 1395d7ed38fd1db2d4d7b5d027a117381fcb81c7 Mon Sep 17 00:00:00 2001 From: Youssef Date: Tue, 18 Jun 2024 16:03:01 +0200 Subject: [PATCH 02/11] Add more logic --- .../compress/cocode/CoCoderFactory.java | 69 ++++++++++++++++--- .../compress/colgroup/ColGroupFactory.java | 14 +++- .../compress/colgroup/ColGroupSizes.java | 10 +++ .../estim/CompressedSizeInfoColGroup.java | 8 ++- .../estim/encoding/SparseEncoding.java | 4 +- 5 files changed, 89 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index 0a6a5d35ca3..a7f15fe7cdf 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -63,16 +63,14 @@ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, Compress // Use column group partitioner to create partitions of columns AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs); - // Find out if any of the groups are empty. - final boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos); - if (detectOneHotEncoding) { - LOG.info("Flag Correct"); - } + boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos); - // if there are no empty or const columns then try cocode algorithms for all columns - if(!containsEmptyConstOrIncompressable) - return co.coCodeColumns(colInfos, k); + // If no empty, constant, incompressible groups and not OHE, cocode all columns + if (!containsEmptyConstOrIncompressable && !detectOneHotEncoding) { + return co.coCodeColumns(colInfos, k); + } else { + // filtered empty groups final List emptyCols = new ArrayList<>(); // filtered const groups @@ -85,6 +83,9 @@ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, Compress final int nRow = colInfos.compressionInfo.get(0).getNumRows(); // filter groups + List currentCandidates = new ArrayList<>(); + List> oheGroups = new ArrayList<>(); + for(int i = 0; i < colInfos.compressionInfo.size(); i++) { CompressedSizeInfoColGroup g = colInfos.compressionInfo.get(i); if(g.isEmpty()) @@ -93,13 +94,33 @@ else if(g.isConst()) constCols.add(g.getColumns()); else if(g.isIncompressable()) incompressable.add(g.getColumns()); - else + else if (isCandidate(g)) { + currentCandidates.add(g); + if (isHotEncoded(currentCandidates)) { + oheGroups.add(new ArrayList<>(currentCandidates)); + currentCandidates.clear(); + } + } else { groups.add(g); + if (!currentCandidates.isEmpty()) { + currentCandidates.clear(); + } + } } // overwrite groups. 
colInfos.compressionInfo = groups; + for (List oheGroup : oheGroups) { + final List oheIndexes = new ArrayList<>(); + for (CompressedSizeInfoColGroup g : oheGroup) { + oheIndexes.add(g.getColumns()); + } + final IColIndex idx = ColIndexFactory.combineIndexes(oheIndexes); + groups.add(new CompressedSizeInfoColGroup(idx, nRow, CompressionType.OHE)); + // colInfos.compressionInfo.add(new CompressedSizeInfoColGroup(idx, nRow, CompressionType.OHE)); + } + // cocode remaining groups if(!groups.isEmpty()) { colInfos = co.coCodeColumns(colInfos, k); @@ -121,7 +142,7 @@ else if(g.isIncompressable()) final IColIndex idx = ColIndexFactory.combineIndexes(incompressable); colInfos.compressionInfo.add(new CompressedSizeInfoColGroup(idx, nRow, CompressionType.UNCOMPRESSED)); } - + return colInfos; } @@ -134,6 +155,34 @@ private static boolean containsEmptyConstOrIncompressable(CompressedSizeInfo col return false; } + private static boolean isCandidate(CompressedSizeInfoColGroup g) { + // Check if the column has exactly 2 distinct value other than 0 + return g.getNumVals() == 2; + } + + private static boolean isHotEncoded(List colGroups) { + if (colGroups.isEmpty()) { + return false; + } + + int numCols = colGroups.size(); + int totalNumVals = 0; + int totalNumOffs = 0; + int numRows = colGroups.get(0).getNumRows(); + + for (CompressedSizeInfoColGroup g : colGroups) { + totalNumVals += g.getNumVals(); + totalNumOffs += g.getNumOffs(); + } + + if (totalNumVals / 2 != numCols || totalNumOffs != numRows) { + return false; + } + + LOG.info("ColGroup is OHE"); + return true; + } + private static AColumnCoCoder createColumnGroupPartitioner(PartitionerType type, AComEst est, ACostEstimate costEstimator, CompressionSettings cs) { switch(type) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 4d0aeb4dc02..871197220d6 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -318,14 +318,21 @@ else if(ct == CompressionType.OHE) { } } + private AColGroup compressOHE(IColIndex colIndexes, int numVals) { //There are some edge cases, can be optimized further // You have to make sure that it is actually OHE - // Make an evil case that the input is OHE except maybe the final row - if(cs.transposed){ + // Ensure numVals is valid + if (numVals <= 0) { + throw new DMLCompressionException("Number of values must be greater than 0 for one-hot encoding"); + } + + // Check if the matrix is transposed + if(cs.transposed) { throw new NotImplementedException("Not implemented"); } - AMapToData data = MapToFactory.create(in.getNumRows(), numVals); + + AMapToData data = MapToFactory.create(in.getNumRows(), numVals); for(int r=0;r(CompressionType.class); switch(ct) { case EMPTY: + _facts = new EstimationFactors(1, nRows); _sizes.put(ct, (double) ColGroupSizes.estimateInMemorySizeEMPTY(columns.size(), columns.isContiguous())); break; case CONST: + _facts = new EstimationFactors(1, nRows); _sizes.put(ct, (double) ColGroupSizes.estimateInMemorySizeCONST(columns.size(), columns.isContiguous(), 1.0, false)); break; case UNCOMPRESSED: + _facts = new EstimationFactors(nRows, nRows); _sizes.put(ct, (double) ColGroupSizes.estimateInMemorySizeUncompressed(nRows, columns.isContiguous(), columns.size(), 1.0)); break; + case OHE: + _facts = new EstimationFactors(columns.size(), nRows); + _sizes.put(ct, 
(double) ColGroupSizes.estimateInMemorySizeOHE(columns.size(), columns.isContiguous(), nRows)); + break; default: throw new DMLCompressionException("Invalid instantiation of const Cost"); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/encoding/SparseEncoding.java b/src/main/java/org/apache/sysds/runtime/compress/estim/encoding/SparseEncoding.java index ffe365127af..69ea2bde175 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/estim/encoding/SparseEncoding.java +++ b/src/main/java/org/apache/sysds/runtime/compress/estim/encoding/SparseEncoding.java @@ -391,10 +391,10 @@ public EstimationFactors extractFacts(int nRows, double tupleSparsity, double ma final int[] counts = map.getCounts(); if(cs.isRLEAllowed()) - return new EstimationFactors(map.getUnique(), map.size(), largestOffs, counts, 0, nRows, map.countRuns(off), + return new EstimationFactors(getUnique(), map.size(), largestOffs, counts, 0, nRows, map.countRuns(off), false, true, matrixSparsity, tupleSparsity); else - return new EstimationFactors(map.getUnique(), map.size(), largestOffs, counts, 0, nRows, false, true, + return new EstimationFactors(getUnique(), map.size(), largestOffs, counts, 0, nRows, false, true, matrixSparsity, tupleSparsity); } From 78e3a1f79a3af32add7fe24d4e6b69deb1fbd0c4 Mon Sep 17 00:00:00 2001 From: Youssef Date: Wed, 10 Jul 2024 10:25:27 +0200 Subject: [PATCH 03/11] Fixed empty cocoders and handle transpose in compressOHE --- .../compress/cocode/CoCoderFactory.java | 50 +++++++++++++--- .../compress/colgroup/ColGroupFactory.java | 60 +++++++++++++++---- .../sysds/runtime/compress/estim/AComEst.java | 4 ++ .../runtime/compress/estim/ComEstSample.java | 5 +- .../compress/estim/CompressedSizeInfo.java | 2 + 5 files changed, 100 insertions(+), 21 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index a7f15fe7cdf..fd0aea92298 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -25,6 +25,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.CompressionSettings; +import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; @@ -60,12 +61,15 @@ public enum PartitionerType { public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, CompressedSizeInfo colInfos, int k, ACostEstimate costEstimator, CompressionSettings cs, boolean detectOneHotEncoding) { + detectOneHotEncoding=false; + + // Use column group partitioner to create partitions of columns AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs); boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos); - - // If no empty, constant, incompressible groups and not OHE, cocode all columns + + // If no empty, constant, incompressible groups and not OHE, cocode all columns if (!containsEmptyConstOrIncompressable && !detectOneHotEncoding) { return co.coCodeColumns(colInfos, k); } @@ -85,6 +89,11 @@ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, Compress // filter groups List 
currentCandidates = new ArrayList<>(); List> oheGroups = new ArrayList<>(); + boolean isSample=false; + if(est.getClass().getSimpleName().equals("ComEstSample")){ + isSample=true; + LOG.info("isSampleTrue"); + } for(int i = 0; i < colInfos.compressionInfo.size(); i++) { CompressedSizeInfoColGroup g = colInfos.compressionInfo.get(i); @@ -96,18 +105,28 @@ else if(g.isIncompressable()) incompressable.add(g.getColumns()); else if (isCandidate(g)) { currentCandidates.add(g); - if (isHotEncoded(currentCandidates)) { + if (isHotEncoded(currentCandidates, isSample)) { oheGroups.add(new ArrayList<>(currentCandidates)); currentCandidates.clear(); } } else { groups.add(g); if (!currentCandidates.isEmpty()) { - currentCandidates.clear(); + for(CompressedSizeInfoColGroup gg: currentCandidates) + groups.add(gg); + currentCandidates.clear(); } } } + // If currentCandidates is not empty, add it to groups + if (!currentCandidates.isEmpty()) { + for (CompressedSizeInfoColGroup gg : currentCandidates) { + groups.add(gg); + } + currentCandidates.clear(); + } + // overwrite groups. colInfos.compressionInfo = groups; @@ -122,6 +141,12 @@ else if (isCandidate(g)) { } // cocode remaining groups + if(colInfos.getInfo().size()<=0 && incompressable.size()<=0 && emptyCols.size()<=0 && constCols.size()==0 && oheGroups.size()<=0) + // Check why it's not added back + // Parsing out statistics to check the performance and if the sizes are smaller + // + throw new DMLCompressionException("empty cocoders 1"); + if(!groups.isEmpty()) { colInfos = co.coCodeColumns(colInfos, k); } @@ -142,10 +167,14 @@ else if (isCandidate(g)) { final IColIndex idx = ColIndexFactory.combineIndexes(incompressable); colInfos.compressionInfo.add(new CompressedSizeInfoColGroup(idx, nRow, CompressionType.UNCOMPRESSED)); } + + if(colInfos.getInfo().size()<=0) + throw new DMLCompressionException("empty cocoders 2"); return colInfos; } + } private static boolean containsEmptyConstOrIncompressable(CompressedSizeInfo colInfos) { @@ -157,15 +186,17 @@ private static boolean containsEmptyConstOrIncompressable(CompressedSizeInfo col private static boolean isCandidate(CompressedSizeInfoColGroup g) { // Check if the column has exactly 2 distinct value other than 0 + LOG.info(g.getNumVals() + "-" + g.getNumRows() + "-" + g.getNumOffs()); return g.getNumVals() == 2; } - private static boolean isHotEncoded(List colGroups) { + private static boolean isHotEncoded(List colGroups, boolean isSample) { if (colGroups.isEmpty()) { return false; } int numCols = colGroups.size(); + LOG.info("numCols: " + numCols); int totalNumVals = 0; int totalNumOffs = 0; int numRows = colGroups.get(0).getNumRows(); @@ -174,10 +205,15 @@ private static boolean isHotEncoded(List colGroups) totalNumVals += g.getNumVals(); totalNumOffs += g.getNumOffs(); } - - if (totalNumVals / 2 != numCols || totalNumOffs != numRows) { + + LOG.info("totalOffs: " + totalNumOffs + ", totalNumVals: " + totalNumVals); + double margin = isSample ? 
0.0007 : 0.0; + if (totalNumVals / 2 != numCols || Math.abs(totalNumOffs - numRows) > numRows * margin) { return false; } + // if (totalNumVals / 2 != numCols || totalNumOffs != numRows) { + // return false; + // } LOG.info("ColGroup is OHE"); return true; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 871197220d6..f482216cf54 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -319,31 +319,65 @@ else if(ct == CompressionType.OHE) { } + // Old implementation + // private AColGroup compressOHE(IColIndex colIndexes, int numVals) { + // //There are some edge cases, can be optimized further + // // You have to make sure that it is actually OHE + // // Ensure numVals is valid + // if (numVals <= 0) { + // throw new DMLCompressionException("Number of values must be greater than 0 for one-hot encoding"); + // } + + // // Check if the matrix is transposed + // if(cs.transposed) { + // throw new NotImplementedException("Not implemented"); + // } + + // AMapToData data = MapToFactory.create(in.getNumRows(), numVals); + // for(int r=0;r 4) // Increase estimate if we get into many columns cocoding to be safe est += ((double) est) * ((double) nCol) / 10; // Bound the estimate with the maxDistinct. + if(sampleFacts.numRows>sampleFacts.numOffs) + est += 1; + // LOG.error(sampleFacts.numOffs + " - " + sampled, null); return Math.max(Math.min(est, Math.min(maxDistinct, numOffs)), 1); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java index fe87057f67a..61cca7da64b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java +++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java @@ -95,6 +95,8 @@ public String getEstimatedDistinct() { StringBuilder sb = new StringBuilder(); if(compressionInfo == null) return ""; + if(compressionInfo.size()<=0) + return ""; sb.append("["); sb.append(compressionInfo.get(0).getNumVals()); for(int i = 1; i < compressionInfo.size(); i++) From 155fa342d121784439e96f115c54fd925f64b69e Mon Sep 17 00:00:00 2001 From: Youssef Date: Wed, 10 Jul 2024 10:28:41 +0200 Subject: [PATCH 04/11] remove flag --- .../sysds/runtime/compress/cocode/CoCoderFactory.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index fd0aea92298..382393b7bbc 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -61,9 +61,6 @@ public enum PartitionerType { public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, CompressedSizeInfo colInfos, int k, ACostEstimate costEstimator, CompressionSettings cs, boolean detectOneHotEncoding) { - detectOneHotEncoding=false; - - // Use column group partitioner to create partitions of columns AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs); @@ -142,9 +139,6 @@ else if (isCandidate(g)) { // cocode remaining groups if(colInfos.getInfo().size()<=0 && incompressable.size()<=0 && emptyCols.size()<=0 && 
constCols.size()==0 && oheGroups.size()<=0) - // Check why it's not added back - // Parsing out statistics to check the performance and if the sizes are smaller - // throw new DMLCompressionException("empty cocoders 1"); if(!groups.isEmpty()) { From 1e04d95ff1532669da90276b5f139e581959945c Mon Sep 17 00:00:00 2001 From: Youssef Date: Mon, 15 Jul 2024 15:56:26 +0200 Subject: [PATCH 05/11] dmlConfig, transpose --- pom.xml | 2 +- .../java/org/apache/sysds/conf/DMLConfig.java | 4 +- .../CompressedMatrixBlockFactory.java | 2 +- .../runtime/compress/CompressionSettings.java | 7 +- .../compress/CompressionSettingsBuilder.java | 16 +++- .../compress/cocode/CoCoderFactory.java | 11 +-- .../compress/colgroup/ColGroupFactory.java | 88 ++++++++++++++++--- .../compress/estim/CompressedSizeInfo.java | 2 + .../component/compress/cost/ACostTest.java | 2 +- 9 files changed, 110 insertions(+), 24 deletions(-) diff --git a/pom.xml b/pom.xml index 144548a7103..0829025ccd7 100644 --- a/pom.xml +++ b/pom.xml @@ -77,7 +77,7 @@ false ** false - -Xms3000m -Xmx3000m -Xmn300m + -Xms3g -Xmx3g -Xmn300m false diff --git a/src/main/java/org/apache/sysds/conf/DMLConfig.java b/src/main/java/org/apache/sysds/conf/DMLConfig.java index dd4d3b2457f..11b1e060a3c 100644 --- a/src/main/java/org/apache/sysds/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysds/conf/DMLConfig.java @@ -88,6 +88,7 @@ public class DMLConfig public static final String COMPRESSED_COST_MODEL= "sysds.compressed.costmodel"; public static final String COMPRESSED_TRANSPOSE = "sysds.compressed.transpose"; public static final String COMPRESSED_TRANSFORMENCODE = "sysds.compressed.transformencode"; + public static final String COMPRESSED_ONEHOTDETECT = "sysds.compressed.onehotdetect"; public static final String NATIVE_BLAS = "sysds.native.blas"; public static final String NATIVE_BLAS_DIR = "sysds.native.blas.directory"; public static final String DAG_LINEARIZATION = "sysds.compile.linearization"; @@ -173,6 +174,7 @@ public class DMLConfig _defaultVals.put(COMPRESSED_COST_MODEL, "AUTO"); _defaultVals.put(COMPRESSED_TRANSPOSE, "auto"); _defaultVals.put(COMPRESSED_TRANSFORMENCODE, "false"); + _defaultVals.put(COMPRESSED_ONEHOTDETECT, "false"); _defaultVals.put(DAG_LINEARIZATION, DagLinearizer.DEPTH_FIRST.name()); _defaultVals.put(CODEGEN, "false" ); _defaultVals.put(CODEGEN_API, GeneratorAPI.JAVA.name() ); @@ -458,7 +460,7 @@ public String getConfigInfo() { CP_PARALLEL_OPS, CP_PARALLEL_IO, PARALLEL_ENCODE, NATIVE_BLAS, NATIVE_BLAS_DIR, COMPRESSED_LINALG, COMPRESSED_LOSSY, COMPRESSED_VALID_COMPRESSIONS, COMPRESSED_OVERLAPPING, COMPRESSED_SAMPLING_RATIO, COMPRESSED_SOFT_REFERENCE_COUNT, - COMPRESSED_COCODE, COMPRESSED_TRANSPOSE, COMPRESSED_TRANSFORMENCODE, DAG_LINEARIZATION, + COMPRESSED_COCODE, COMPRESSED_TRANSPOSE, COMPRESSED_TRANSFORMENCODE, COMPRESSED_ONEHOTDETECT, DAG_LINEARIZATION, CODEGEN, CODEGEN_API, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, STATS_MAX_WRAP_LEN, LINEAGECACHESPILL, COMPILERASSISTED_RW, BUFFERPOOL_LIMIT, MEMORY_MANAGER, PRINT_GPU_MEMORY_INFO, AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, GPU_RULE_BASED_PLACEMENT, diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java index 77c1b3ed239..b31ef4afddf 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java @@ 
-359,7 +359,7 @@ private void classifyPhase() { private void coCodePhase() { compressionGroups = CoCoderFactory.findCoCodesByPartitioning(informationExtractor, compressionGroups, k, - costEstimator, compSettings, true); + costEstimator, compSettings); _stats.estimatedSizeCoCoded = compressionGroups.memoryEstimate(); _stats.estimatedCostCoCoded = costEstimator.getCost(compressionGroups); diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index 062ccfc1201..4fedbabe777 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -128,11 +128,14 @@ public class CompressionSettings { /** The sorting type used in sorting/joining offsets to create SDC groups */ public final SORT_TYPE sdcSortType; + /** Flag to detect one hot encodedness */ + public final boolean oneHotDetect; + protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, String transposeInput, int seed, boolean lossy, EnumSet validCompressions, boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType) { + double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, boolean oneHotDetect) { this.samplingRatio = samplingRatio; this.samplePower = samplePower; this.allowSharedDictionary = allowSharedDictionary; @@ -151,6 +154,7 @@ protected CompressionSettings(double samplingRatio, double samplePower, boolean this.minimumCompressionRatio = minimumCompressionRatio; this.isInSparkInstruction = isInSparkInstruction; this.sdcSortType = sdcSortType; + this.oneHotDetect = oneHotDetect; if(LOG.isDebugEnabled()) LOG.debug(this.toString()); } @@ -168,6 +172,7 @@ public String toString() { sb.append("\t Partitioner: " + columnPartitioner); sb.append("\t Lossy: " + lossy); sb.append("\t Cost Computation Type: " + costComputationType); + sb.append("\t One Hot Encoding Check Flag: " + oneHotDetect); if(samplingRatio < 1.0) sb.append("\t Estimation Type: " + estimationType); return sb.toString(); diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java index ec5512266e8..54f6ee07867 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java @@ -55,6 +55,7 @@ public class CompressionSettingsBuilder { private double minimumCompressionRatio = 1.0; private boolean isInSparkInstruction = false; private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; + private boolean oneHotDetect = false; public CompressionSettingsBuilder() { @@ -69,6 +70,7 @@ public CompressionSettingsBuilder() { costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL)); transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE); seed = DMLScript.SEED; + oneHotDetect = conf.getBooleanValue(DMLConfig.COMPRESSED_ONEHOTDETECT); } @@ -90,6 +92,7 @@ public CompressionSettingsBuilder copySettings(CompressionSettings that) { this.maxColGroupCoCode = that.maxColGroupCoCode; this.coCodePercentage = 
that.coCodePercentage; this.minimumSampleSize = that.minimumSampleSize; + this.oneHotDetect = that.oneHotDetect; return this; } @@ -170,6 +173,17 @@ public CompressionSettingsBuilder setSeed(int seed) { return this; } + /** + * Set the flag for detecting one hot encodedness for the compression operation. + * + * @param enabled The flag to enable or disable OHE checks. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setOneHotDetect(boolean enabled) { + this.oneHotDetect = enabled; + return this; + } + /** * Set the valid compression strategies used for the compression. * @@ -334,6 +348,6 @@ public CompressionSettings create() { return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy, validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage, minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction, - sdcSortType); + sdcSortType, oneHotDetect); } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index 382393b7bbc..ebd137e263c 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -55,19 +55,18 @@ public enum PartitionerType { * @param k The concurrency degree allowed for this operation. * @param costEstimator The Cost estimator to estimate the cost of the compression * @param cs The compression settings used in the compression. - * @param detectOneHotEncoding Flag to Enable/Disable OHE Detection * @return The estimated (hopefully) best groups of ColGroups. 
*/ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, CompressedSizeInfo colInfos, int k, - ACostEstimate costEstimator, CompressionSettings cs, boolean detectOneHotEncoding) { - + ACostEstimate costEstimator, CompressionSettings cs) { + // Use column group partitioner to create partitions of columns AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs); boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos); // If no empty, constant, incompressible groups and not OHE, cocode all columns - if (!containsEmptyConstOrIncompressable && !detectOneHotEncoding) { + if (!containsEmptyConstOrIncompressable && !cs.oneHotDetect) { return co.coCodeColumns(colInfos, k); } else { @@ -89,7 +88,6 @@ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, Compress boolean isSample=false; if(est.getClass().getSimpleName().equals("ComEstSample")){ isSample=true; - LOG.info("isSampleTrue"); } for(int i = 0; i < colInfos.compressionInfo.size(); i++) { @@ -180,7 +178,6 @@ private static boolean containsEmptyConstOrIncompressable(CompressedSizeInfo col private static boolean isCandidate(CompressedSizeInfoColGroup g) { // Check if the column has exactly 2 distinct value other than 0 - LOG.info(g.getNumVals() + "-" + g.getNumRows() + "-" + g.getNumOffs()); return g.getNumVals() == 2; } @@ -190,7 +187,6 @@ private static boolean isHotEncoded(List colGroups, } int numCols = colGroups.size(); - LOG.info("numCols: " + numCols); int totalNumVals = 0; int totalNumOffs = 0; int numRows = colGroups.get(0).getNumRows(); @@ -200,7 +196,6 @@ private static boolean isHotEncoded(List colGroups, totalNumOffs += g.getNumOffs(); } - LOG.info("totalOffs: " + totalNumOffs + ", totalNumVals: " + totalNumVals); double margin = isSample ? 0.0007 : 0.0; if (totalNumVals / 2 != numCols || Math.abs(totalNumOffs - numRows) > numRows * margin) { return false; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index f482216cf54..226fcfc2650 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -285,7 +285,8 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) { } else if(ct == CompressionType.OHE) { - return compressOHE(colIndexes,cg.getNumVals()); + boolean isSample = ce.getClass().getSimpleName().equals("ComEstSample"); + return compressOHE(colIndexes,cg,isSample); } final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, in, cg.getNumVals(), cs); @@ -346,38 +347,105 @@ else if(ct == CompressionType.OHE) { // } //new implementation - private AColGroup compressOHE(IColIndex colIndexes, int numVals) { + // private AColGroup compressOHE(IColIndex colIndexes, int numVals) { + // // Ensure numVals is valid + // if (numVals <= 0) { + // throw new DMLCompressionException("Number of values must be greater than 0 for one-hot encoding"); + // } + + // AMapToData data = MapToFactory.create(cs.transposed ? 
in.getNumColumns() : in.getNumRows(), numVals); + + // if(cs.transposed) { + // // Handle transposed matrix + // for(int c = 0; c < in.getNumColumns(); c++){ + // for(int r = 0; r < colIndexes.size(); r++){ + // if(in.get(colIndexes.get(r), c) == 1){ + // data.set(c, r); + // break; + // } + // } + // } + // } else { + // // Handle non-transposed matrix + // for(int r = 0; r < in.getNumRows(); r++){ + // for(int c = 0; c < colIndexes.size(); c++){ + // if(in.get(r, colIndexes.get(c)) == 1){ + // data.set(r, c); + // break; + // } + // } + // } + // } + + // return ColGroupDDC.create(colIndexes, new IdentityDictionary(numVals), data, null); + // } + + private AColGroup compressOHE(IColIndex colIndexes, CompressedSizeInfoColGroup cg, boolean isSample) throws Exception { // Ensure numVals is valid + int numVals = cg.getNumVals(); if (numVals <= 0) { throw new DMLCompressionException("Number of values must be greater than 0 for one-hot encoding"); } - AMapToData data = MapToFactory.create(cs.transposed ? in.getNumColumns() : in.getNumRows(), numVals); + AMapToData data; if(cs.transposed) { // Handle transposed matrix - for(int c = 0; c < in.getNumColumns(); c++){ - for(int r = 0; r < colIndexes.size(); r++){ - if(in.get(colIndexes.get(r), c) == 1){ - data.set(c, r); - break; - } + data = MapToFactory.create(cs.transposed ? in.getNumColumns() : in.getNumRows(), numVals+1); + SparseBlock sb = in.getSparseBlock(); + data.fill(numVals+1); + for(int c = 0; c cocodeCols = ColGroupFactory.compressColGroups(mb, cocodeGroups, cs, k); double actualCostCoCode = ce.getCost(cocodeCols, nRows); From 2330b6b704fd96646da4bc2dc45f04fc12d79514 Mon Sep 17 00:00:00 2001 From: Youssef Date: Mon, 15 Jul 2024 15:59:17 +0200 Subject: [PATCH 06/11] fix pom.xml --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 0829025ccd7..144548a7103 100644 --- a/pom.xml +++ b/pom.xml @@ -77,7 +77,7 @@ false ** false - -Xms3g -Xmx3g -Xmn300m + -Xms3000m -Xmx3000m -Xmn300m false From 2645ca7db4a94086d203ef0e973ffdc2234fdad1 Mon Sep 17 00:00:00 2001 From: Youssef Date: Mon, 22 Jul 2024 18:23:49 +0200 Subject: [PATCH 07/11] Use nnzCols for sample and add experiments --- scripts/perftest/ohe_checks/README.md | 25 ++++++++ scripts/perftest/ohe_checks/experiment.dml | 27 ++++++++ scripts/perftest/ohe_checks/experiments.sh | 41 +++++++++++++ scripts/perftest/ohe_checks/ohe.xml | 3 + .../CompressedMatrixBlockFactory.java | 2 + .../compress/CompressionStatistics.java | 2 + .../compress/cocode/CoCoderFactory.java | 40 ++++++------ .../compress/colgroup/ColGroupFactory.java | 61 ------------------- .../transform/encode/MultiColumnEncoder.java | 1 + 9 files changed, 124 insertions(+), 78 deletions(-) create mode 100644 scripts/perftest/ohe_checks/README.md create mode 100644 scripts/perftest/ohe_checks/experiment.dml create mode 100644 scripts/perftest/ohe_checks/experiments.sh create mode 100644 scripts/perftest/ohe_checks/ohe.xml diff --git a/scripts/perftest/ohe_checks/README.md b/scripts/perftest/ohe_checks/README.md new file mode 100644 index 00000000000..4c72af02710 --- /dev/null +++ b/scripts/perftest/ohe_checks/README.md @@ -0,0 +1,25 @@ + + +# Checking One Hot Encoding before Compression tests + +To run all performance tests for SystemDS: + * install systemds, + * install the prerequisites, + * run experiments.sh \ No newline at end of file diff --git a/scripts/perftest/ohe_checks/experiment.dml b/scripts/perftest/ohe_checks/experiment.dml new file mode 100644 index 
00000000000..1a310adfe75
--- /dev/null
+++ b/scripts/perftest/ohe_checks/experiment.dml
@@ -0,0 +1,27 @@
+## This script generates a random matrix, transforms some columns to be One-Hot-Encoded, and then compresses
+
+# Set default values
+default_rows = 1000
+default_cols = 10
+default_dummy = "[1]"
+default_repeats = 1
+default_num_distinct = 10
+
+#nvargs
+rows = ifdef($rows, default_rows)
+cols = ifdef($cols, default_cols)
+dummy = ifdef($dummy, default_dummy)
+repeats = ifdef($repeats, default_repeats)
+num_distinct = ifdef($distinct, default_num_distinct)
+
+# Generate random matrix and apply transformations
+x = rand(rows=rows, cols=cols, min=0, max=num_distinct)
+x = floor(x)
+Fall = as.frame(x)
+jspec = "{ids: true, dummycode: " + dummy + "}";
+for(i in 1:repeats){
+    [T,M] = transformencode(target=Fall, spec=jspec)
+    # T[100,2]=1
+    xc = compress(T)
+}
+print(toString(xc))
diff --git a/scripts/perftest/ohe_checks/experiments.sh b/scripts/perftest/ohe_checks/experiments.sh
new file mode 100644
index 00000000000..86dfec1143d
--- /dev/null
+++ b/scripts/perftest/ohe_checks/experiments.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+mkdir BaselineLogs
+mkdir OHELogs
+run_base() {
+    systemds experiment.dml \
+    --seed 42 --debug -nvargs rows=$1 cols=$2 dummy="$3" distinct=$4 > BaselineLogs/${5}_${1}_rows_${2}_cols_${3}_encoded_base.txt 2>&1
+}
+
+run_ohe() {
+    systemds experiment.dml \
+    --seed 42 --debug --config ohe.xml -nvargs rows=$1 cols=$2 dummy="$3" distinct=$4> OHELogs/${5}_${1}_rows_${2}_cols_${3}_encoded_ohe.txt 2>&1
+}
+
+# Run same experiments but checking One-Hot-Encoded columns first
+run_ohe 1000 1 "[1]" 10 1
+run_ohe 1000 5 "[2]" 10 2
+run_ohe 1000 5 "[1,2]" 10 3
+run_ohe 1000 5 "[1,2,3]" 10 4
+run_ohe 1000 5 "[1,2,3,4,5]" 10 5
+run_ohe 1000 10 "[1,3,5]" 10 6
+run_ohe 1000 10 "[1,2,5,6]" 10 7
+run_ohe 100000 1 "[1]" 100 8
+run_ohe 100000 5 "[1,2]" 100 9
+run_ohe 100000 5 "[1,2,3]" 100 10
+run_ohe 100000 100 "[1,3,50,60,70,80]" 100 11
+run_ohe 100000 100 "[1,2,24,25,50,51]" 100 12
+
+# Run baseline experiments
+run_base 1000 1 "[1]" 10 1
+run_base 1000 5 "[2]" 10 2
+run_base 1000 5 "[1,2]" 10 3
+run_base 1000 5 "[1,2,3]" 10 4
+run_base 1000 5 "[1,2,3,4,5]" 10 5
+run_base 1000 10 "[1,3,5]" 10 6
+run_base 1000 10 "[1,2,5,6]" 10 7
+run_base 100000 1 "[1]" 100 8
+run_base 100000 5 "[1,2]" 100 9
+run_base 100000 5 "[1,2,3]" 100 10
+run_base 100000 100 "[1,3,50,60,70,80]" 100 11
+run_base 100000 100 "[1,2,24,25,50,51]" 100 12
\ No newline at end of file
diff --git a/scripts/perftest/ohe_checks/ohe.xml b/scripts/perftest/ohe_checks/ohe.xml
new file mode 100644
index 00000000000..b5eaf4c9ca8
--- /dev/null
+++ b/scripts/perftest/ohe_checks/ohe.xml
@@ -0,0 +1,3 @@
+<root>
+   <sysds.compressed.onehotdetect>true</sysds.compressed.onehotdetect>
+</root>
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
index b31ef4afddf..9aa934dfe7b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
@@ -285,6 +285,7 @@ else if(mb instanceof CompressedMatrixBlock && ((CompressedMatrixBlock) mb).isOv
 		_stats.denseSize = MatrixBlock.estimateSizeInMemory(mb.getNumRows(), mb.getNumColumns(), 1.0);
 		_stats.sparseSize = MatrixBlock.estimateSizeSparseInMemory(mb.getNumRows(), mb.getNumColumns(), mb.getSparsity());
+		_stats.sparsity = mb.getSparsity();
 		_stats.originalSize = mb.getInMemorySize();
 		_stats.originalCost = costEstimator.getCost(mb);
@@ -522,6 +523,7 @@ private void logPhase() { LOG.debug("--col groups sizes " + _stats.getGroupsSizesString()); LOG.debug("--input was compressed " + (mb instanceof CompressedMatrixBlock)); LOG.debug(String.format("--dense size: %16d", _stats.denseSize)); + LOG.debug(String.format("--sparsity: %4.3f", _stats.sparsity)); LOG.debug(String.format("--sparse size: %16d", _stats.sparseSize)); LOG.debug(String.format("--original size: %16d", _stats.originalSize)); LOG.debug(String.format("--compressed size: %16d", _stats.compressedSize)); diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionStatistics.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionStatistics.java index d54eb2c3525..9708e0b19f7 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionStatistics.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionStatistics.java @@ -46,6 +46,8 @@ public class CompressionStatistics { /** Compressed size */ public long compressedSize; + /** Sparsity of input matrix */ + public double sparsity; /** Cost calculated by the cost estimator on input */ public double originalCost = Double.NaN; /** Summed cost estimated from individual columns */ diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index ebd137e263c..e692e0fdfc5 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -85,11 +85,12 @@ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, Compress // filter groups List currentCandidates = new ArrayList<>(); List> oheGroups = new ArrayList<>(); - boolean isSample=false; - if(est.getClass().getSimpleName().equals("ComEstSample")){ - isSample=true; - } - + boolean isSample = est.getClass().getSimpleName().equals("ComEstSample"); + if(est.getNnzCols()==null) + LOG.debug("NNZ is null"); + else + LOG.debug("NNZ is not null"); + int[] nnzCols = est.getNnzCols(); for(int i = 0; i < colInfos.compressionInfo.size(); i++) { CompressedSizeInfoColGroup g = colInfos.compressionInfo.get(i); if(g.isEmpty()) @@ -100,7 +101,7 @@ else if(g.isIncompressable()) incompressable.add(g.getColumns()); else if (isCandidate(g)) { currentCandidates.add(g); - if (isHotEncoded(currentCandidates, isSample)) { + if (isHotEncoded(currentCandidates, isSample, nnzCols)) { oheGroups.add(new ArrayList<>(currentCandidates)); currentCandidates.clear(); } @@ -132,7 +133,6 @@ else if (isCandidate(g)) { } final IColIndex idx = ColIndexFactory.combineIndexes(oheIndexes); groups.add(new CompressedSizeInfoColGroup(idx, nRow, CompressionType.OHE)); - // colInfos.compressionInfo.add(new CompressedSizeInfoColGroup(idx, nRow, CompressionType.OHE)); } // cocode remaining groups @@ -181,7 +181,7 @@ private static boolean isCandidate(CompressedSizeInfoColGroup g) { return g.getNumVals() == 2; } - private static boolean isHotEncoded(List colGroups, boolean isSample) { + private static boolean isHotEncoded(List colGroups, boolean isSample, int[] nnzCols) { if (colGroups.isEmpty()) { return false; } @@ -190,23 +190,29 @@ private static boolean isHotEncoded(List colGroups, int totalNumVals = 0; int totalNumOffs = 0; int numRows = colGroups.get(0).getNumRows(); - - for (CompressedSizeInfoColGroup g : colGroups) { + + for (int i = 0; i < colGroups.size(); i++) { + CompressedSizeInfoColGroup g = colGroups.get(i); totalNumVals += 
g.getNumVals(); - totalNumOffs += g.getNumOffs(); + if(totalNumVals/2 > numCols) + return false; + // If sampling is used, get the number of non-zeroes from the nnzCols array + if (isSample && nnzCols!=null) { + totalNumOffs += nnzCols[i]; + } else { + totalNumOffs += g.getNumOffs(); + } } - - double margin = isSample ? 0.0007 : 0.0; - if (totalNumVals / 2 != numCols || Math.abs(totalNumOffs - numRows) > numRows * margin) { + + // Strict check without margin + if (totalNumVals / 2 != numCols || totalNumOffs != numRows) { return false; } - // if (totalNumVals / 2 != numCols || totalNumOffs != numRows) { - // return false; - // } LOG.info("ColGroup is OHE"); return true; } + private static AColumnCoCoder createColumnGroupPartitioner(PartitionerType type, AComEst est, ACostEstimate costEstimator, CompressionSettings cs) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 226fcfc2650..2887500a6da 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -29,7 +29,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; -import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; @@ -320,66 +319,6 @@ else if(ct == CompressionType.OHE) { } - // Old implementation - // private AColGroup compressOHE(IColIndex colIndexes, int numVals) { - // //There are some edge cases, can be optimized further - // // You have to make sure that it is actually OHE - // // Ensure numVals is valid - // if (numVals <= 0) { - // throw new DMLCompressionException("Number of values must be greater than 0 for one-hot encoding"); - // } - - // // Check if the matrix is transposed - // if(cs.transposed) { - // throw new NotImplementedException("Not implemented"); - // } - - // AMapToData data = MapToFactory.create(in.getNumRows(), numVals); - // for(int r=0;r Date: Wed, 24 Jul 2024 01:16:11 +0200 Subject: [PATCH 08/11] Implementation fixes, formatting changes --- .../compress/cocode/CoCoderFactory.java | 119 ++++++++++-------- .../compress/colgroup/ColGroupFactory.java | 59 ++++----- 2 files changed, 93 insertions(+), 85 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index e692e0fdfc5..45c05912644 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -59,18 +59,18 @@ public enum PartitionerType { */ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, CompressedSizeInfo colInfos, int k, ACostEstimate costEstimator, CompressionSettings cs) { - + // Use column group partitioner to create partitions of columns AColumnCoCoder co = createColumnGroupPartitioner(cs.columnPartitioner, est, costEstimator, cs); - boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos); - + boolean containsEmptyConstOrIncompressable = containsEmptyConstOrIncompressable(colInfos); + // If no empty, constant, incompressible groups and not OHE, cocode all columns - if (!containsEmptyConstOrIncompressable && !cs.oneHotDetect) { - return 
co.coCodeColumns(colInfos, k); + if(!containsEmptyConstOrIncompressable && !cs.oneHotDetect) { + return co.coCodeColumns(colInfos, k); } else { - + // filtered empty groups final List emptyCols = new ArrayList<>(); // filtered const groups @@ -84,12 +84,11 @@ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, Compress // filter groups List currentCandidates = new ArrayList<>(); - List> oheGroups = new ArrayList<>(); + List> oheGroups = new ArrayList<>(); + long startOheCheckTime = System.nanoTime(); boolean isSample = est.getClass().getSimpleName().equals("ComEstSample"); - if(est.getNnzCols()==null) + if(est.getNnzCols() == null) LOG.debug("NNZ is null"); - else - LOG.debug("NNZ is not null"); int[] nnzCols = est.getNnzCols(); for(int i = 0; i < colInfos.compressionInfo.size(); i++) { CompressedSizeInfoColGroup g = colInfos.compressionInfo.get(i); @@ -99,25 +98,35 @@ else if(g.isConst()) constCols.add(g.getColumns()); else if(g.isIncompressable()) incompressable.add(g.getColumns()); - else if (isCandidate(g)) { - currentCandidates.add(g); - if (isHotEncoded(currentCandidates, isSample, nnzCols)) { - oheGroups.add(new ArrayList<>(currentCandidates)); - currentCandidates.clear(); - } - } else { + else if(isCandidate(g)) { + currentCandidates.add(g); + String oheStatus = isHotEncoded(currentCandidates, isSample, nnzCols, nRow); + if(oheStatus.equals("NOT_OHE")) { + groups.addAll(currentCandidates); + currentCandidates.clear(); + } + else if(oheStatus.equals("VALID_OHE")) { + LOG.debug("FOUND OHE"); + oheGroups.add(new ArrayList<>(currentCandidates)); + currentCandidates.clear(); + } + } + else { groups.add(g); - if (!currentCandidates.isEmpty()) { - for(CompressedSizeInfoColGroup gg: currentCandidates) + if(!currentCandidates.isEmpty()) { + for(CompressedSizeInfoColGroup gg : currentCandidates) groups.add(gg); currentCandidates.clear(); - } - } + } + } } + long endOheCheckTime = System.nanoTime(); // End time for OHE checks + long durationOheCheckTime = endOheCheckTime - startOheCheckTime; + LOG.debug("OHE checks duration: " + durationOheCheckTime / 1e6 + " ms"); // If currentCandidates is not empty, add it to groups - if (!currentCandidates.isEmpty()) { - for (CompressedSizeInfoColGroup gg : currentCandidates) { + if(!currentCandidates.isEmpty()) { + for(CompressedSizeInfoColGroup gg : currentCandidates) { groups.add(gg); } currentCandidates.clear(); @@ -126,19 +135,20 @@ else if (isCandidate(g)) { // overwrite groups. 
colInfos.compressionInfo = groups; - for (List oheGroup : oheGroups) { - final List oheIndexes = new ArrayList<>(); - for (CompressedSizeInfoColGroup g : oheGroup) { - oheIndexes.add(g.getColumns()); - } - final IColIndex idx = ColIndexFactory.combineIndexes(oheIndexes); + for(List oheGroup : oheGroups) { + final List oheIndexes = new ArrayList<>(); + for(CompressedSizeInfoColGroup g : oheGroup) { + oheIndexes.add(g.getColumns()); + } + final IColIndex idx = ColIndexFactory.combineIndexes(oheIndexes); groups.add(new CompressedSizeInfoColGroup(idx, nRow, CompressionType.OHE)); - } + } // cocode remaining groups - if(colInfos.getInfo().size()<=0 && incompressable.size()<=0 && emptyCols.size()<=0 && constCols.size()==0 && oheGroups.size()<=0) + if(colInfos.getInfo().size() <= 0 && incompressable.size() <= 0 && emptyCols.size() <= 0 && + constCols.size() == 0 && oheGroups.size() <= 0) throw new DMLCompressionException("empty cocoders 1"); - + if(!groups.isEmpty()) { colInfos = co.coCodeColumns(colInfos, k); } @@ -160,9 +170,9 @@ else if (isCandidate(g)) { colInfos.compressionInfo.add(new CompressedSizeInfoColGroup(idx, nRow, CompressionType.UNCOMPRESSED)); } - if(colInfos.getInfo().size()<=0) + if(colInfos.getInfo().size() <= 0) throw new DMLCompressionException("empty cocoders 2"); - + return colInfos; } @@ -178,41 +188,44 @@ private static boolean containsEmptyConstOrIncompressable(CompressedSizeInfo col private static boolean isCandidate(CompressedSizeInfoColGroup g) { // Check if the column has exactly 2 distinct value other than 0 - return g.getNumVals() == 2; + return(g.getNumVals() == 2); } - private static boolean isHotEncoded(List colGroups, boolean isSample, int[] nnzCols) { - if (colGroups.isEmpty()) { - return false; + private static String isHotEncoded(List colGroups, boolean isSample, int[] nnzCols, + int numRows) { + if(colGroups.isEmpty()) { + return "NOT_OHE"; } - + int numCols = colGroups.size(); int totalNumVals = 0; int totalNumOffs = 0; - int numRows = colGroups.get(0).getNumRows(); - for (int i = 0; i < colGroups.size(); i++) { + for(int i = 0; i < colGroups.size(); i++) { CompressedSizeInfoColGroup g = colGroups.get(i); totalNumVals += g.getNumVals(); - if(totalNumVals/2 > numCols) - return false; + if(totalNumVals / 2 > numCols) + return "NOT_OHE"; // If sampling is used, get the number of non-zeroes from the nnzCols array - if (isSample && nnzCols!=null) { + if(isSample && nnzCols != null) { totalNumOffs += nnzCols[i]; - } else { + } + else { totalNumOffs += g.getNumOffs(); } + if(totalNumOffs > numRows) { + return "NOT_OHE"; + } } - - // Strict check without margin - if (totalNumVals / 2 != numCols || totalNumOffs != numRows) { - return false; + + // Check if the current candidates form a valid OHE group + if((totalNumVals / 2) == numCols && totalNumOffs == numRows) { + return "VALID_OHE"; } - - LOG.info("ColGroup is OHE"); - return true; + + // If still under the row limit, it's potentially OHE + return "POTENTIAL_OHE"; } - private static AColumnCoCoder createColumnGroupPartitioner(PartitionerType type, AComEst est, ACostEstimate costEstimator, CompressionSettings cs) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 2887500a6da..ec1d6beefa5 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -227,12 +227,12 @@ 
private void logEstVsActual(double time, AColGroup act, CompressedSizeInfoColGro if(estC < actC * 0.75) { String warning = "The estimate cost is significantly off : " + est; LOG.debug( - String.format("time[ms]: %10.2f %25s est %10.0f -- act %10.0f distinct:%5d cols:%s wanted:%s\n\t\t%s", time, - retType, estC, actC, act.getNumValues(), cols, wanted, warning)); + String.format("time[ms]: %10.2f %25s est %10.0f -- act %10.0f distinct:%5d cols:%s wanted:%s\n\t\t%s", + time, retType, estC, actC, act.getNumValues(), cols, wanted, warning)); } else { - LOG.debug(String.format("time[ms]: %10.2f %25s est %10.0f -- act %10.0f distinct:%5d cols:%s wanted:%s", time, - retType, estC, actC, act.getNumValues(), cols, wanted)); + LOG.debug(String.format("time[ms]: %10.2f %25s est %10.0f -- act %10.0f distinct:%5d cols:%s wanted:%s", + time, retType, estC, actC, act.getNumValues(), cols, wanted)); } } @@ -285,7 +285,7 @@ else if(ct == CompressionType.SDC && colIndexes.size() == 1 && !t) { else if(ct == CompressionType.OHE) { boolean isSample = ce.getClass().getSimpleName().equals("ComEstSample"); - return compressOHE(colIndexes,cg,isSample); + return compressOHE(colIndexes, cg, isSample); } final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, in, cg.getNumVals(), cs); @@ -318,52 +318,49 @@ else if(ct == CompressionType.OHE) { } } - - private AColGroup compressOHE(IColIndex colIndexes, CompressedSizeInfoColGroup cg, boolean isSample) throws Exception { + private AColGroup compressOHE(IColIndex colIndexes, CompressedSizeInfoColGroup cg, boolean isSample) + throws Exception { // Ensure numVals is valid int numVals = cg.getNumVals(); - if (numVals <= 0) { + if(numVals <= 0) { throw new DMLCompressionException("Number of values must be greater than 0 for one-hot encoding"); } - + AMapToData data; - + if(cs.transposed) { // Handle transposed matrix - data = MapToFactory.create(cs.transposed ? in.getNumColumns() : in.getNumRows(), numVals+1); + data = MapToFactory.create(cs.transposed ? in.getNumColumns() : in.getNumRows(), numVals + 1); SparseBlock sb = in.getSparseBlock(); - data.fill(numVals+1); - for(int c = 0; c Date: Wed, 24 Jul 2024 16:47:09 +0200 Subject: [PATCH 09/11] Add documentation, parsing script, remove timing in CoCoderFactory --- scripts/perftest/ohe_checks/README.md | 134 +++++++++++++++++- scripts/perftest/ohe_checks/experiment.dml | 1 - scripts/perftest/ohe_checks/experiments.sh | 4 +- scripts/perftest/ohe_checks/parse_logs.py | 68 +++++++++ .../compress/cocode/CoCoderFactory.java | 4 - .../runtime/compress/estim/ComEstSample.java | 1 - 6 files changed, 200 insertions(+), 12 deletions(-) create mode 100644 scripts/perftest/ohe_checks/parse_logs.py diff --git a/scripts/perftest/ohe_checks/README.md b/scripts/perftest/ohe_checks/README.md index 4c72af02710..d52fe401ceb 100644 --- a/scripts/perftest/ohe_checks/README.md +++ b/scripts/perftest/ohe_checks/README.md @@ -17,9 +17,135 @@ limitations under the License. 
{% end comment %} --> -# Checking One Hot Encoding before Compression tests +# Checking One Hot Encodedness before Compression tests -To run all performance tests for SystemDS: +To run all tests for One Hot Encoding Checks: * install systemds, - * install the prerequisites, - * run experiments.sh \ No newline at end of file + * make sure that the paths for SYSTEMDS_ROOT, JAVA_HOME, HADOOP_HOME, LOG4JPROP are correctly set + * run experiments.sh + +Alternatively, to run the experiment.dml script directly with OHE checks enabled, use this command: + +`$SYSTEMDS_ROOT/bin/systemds $SYSTEMDS_ROOT/target/SystemDS.jar experiment.dml --config ohe.xml ` + +Note: You can use -nvargs to set the variables rows, cols, dummy, distinct, repeats (how many times you want to generate a random matrix, transform-encode it and compress it) + +(Dummy is the array of column indexes that you would like to One Hot Encode, example: dummy="[1]" will One Hot Encode the first column) + +To collect the metrics from the logs for easier comparison, you can run `parse_logs.py` and an excel file called `combined_metrics.xlsx` will be created in this directory. +--- +# Documentation of Changes to codebase for Implementing OHE Checks + +## Flag to enable/disable OHE checks (Disabled by default) +- Added ``COMPRESSED_ONEHOTDETECT = "sysds.compressed.onehotdetect"`` to ``DMLConfig`` and adjusted the relevant methods +- Added attribute to ``CompressionSettings`` ``public final boolean oneHotDetect`` and adjusted the methods +- Adjusted ``CompressionSettingsBuilder`` to check if ``COMPRESSED_ONEHOTDETECT`` has been set to true to enable the checks + +## Changes in `CoCoderFactory` + +### 1. Introduction of OHE Detection + +**Condition Addition:** +- Added a condition to check for `cs.oneHotDetect` along with the existing condition `!containsEmptyConstOrIncompressable` in the `findCoCodesByPartitioning` method. This ensures that the process considers OHE detection only if it is enabled in the compression settings. +- Original code only checked for `containsEmptyConstOrIncompressable` and proceeded to cocode all columns if false. The updated code includes an additional check for `cs.oneHotDetect`. + +### 2. New Data Structures for OHE Handling + +**New Lists:** Introduced two new lists to manage the OHE detection process: +- `currentCandidates`: To store the current candidate columns that might form an OHE group. +- `oheGroups`: To store lists of columns that have been validated as OHE groups. + +### 3. Filtering Logic Enhancements + +**Column Filtering:** Enhanced the loop that iterates over columns to identify OHE candidates: +- Columns that are empty, constant, or incompressible are filtered into respective lists. +- For other columns, they are added to `currentCandidates` if they are deemed candidates (via `isCandidate` function). + +### 4. Addition of `isHotEncoded` Function + +**Function Creation:** Created a new `isHotEncoded` function to evaluate if the accumulated columns form a valid OHE group. +- **Parameters:** Takes a list of column groups (`colGroups`), a boolean flag (`isSample`), an array of non-zero counts (`nnzCols`), and the number of rows (`numRows`). +- **Return Type:** Returns a `String` indicating the status of the current candidates: + - `"POTENTIAL_OHE"`: When the current candidates could still form an OHE group. + - `"NOT_OHE"`: When the current candidates cannot form an OHE group. + - `"VALID_OHE"`: When the current candidates form a valid OHE group. 
+- **Logic:** The function calculates the total number of distinct values and offsets, and checks if they meet the criteria for forming an OHE group. + +### 5. Enhanced Group Handling + +**Candidate Processing:** Within the loop, after adding a column to `currentCandidates`: +- Calls `isHotEncoded` to check the status of the candidates. +- If `isHotEncoded` returns `"NOT_OHE"`, moves the candidates to regular groups and clears the candidates list. +- If `isHotEncoded` returns `"VALID_OHE"`, moves the candidates to `oheGroups` and clears the candidates list. +- If `isHotEncoded` returns `"POTENTIAL_OHE"`, continues accumulating candidates. + +### 6. Final Candidate Check + +**Post-loop Check:** After the loop, checks any remaining `currentCandidates`: +- If they form a valid OHE group, adds them to `oheGroups`. +- Otherwise, adds them to regular groups. + +### 7. Overwrite and CoCode Groups + +**Overwrite Groups:** Updates `colInfos.compressionInfo` with the processed `groups`. +**OHE Group Integration:** Combines indexes for validated OHE groups and adds them to the final `groups`. + +## One Hot Encoded Columns Compression in `ColGroupFactory` + +### Description + +The `compressOHE` function is designed to compress columns that are one-hot encoded (OHE). It validates and processes the input data to ensure it meets the criteria for one-hot encoding, and if so, it compresses the data accordingly. If the data does not meet the OHE criteria, it falls back to a direct compression method (`directCompressDDC`). + +### Implementation Details + +1. **Validation of `numVals`**: + - Ensures the number of distinct values (`numVals`) in the column group is greater than 0. + - Throws a `DMLCompressionException` if `numVals` is less than or equal to 0. + +2. **Handling Transposed Matrix**: + - If the matrix is transposed (`cs.transposed` is `true`): + - Creates a `MapToFactory` data structure with an additional unique value. + - Iterates through the sparse block of the matrix, checking for non-one values or multiple ones in the same row. + - If a column index in the sparse block is empty, or if non-one values or multiple ones are found, it falls back to `directCompressDDC`. + +3. **Handling Non-Transposed Matrix**: + - If the matrix is not transposed (`cs.transposed` is `false`): + - Creates a `MapToFactory` data structure. + - Iterates through each row of the matrix: + - Checks for the presence of exactly one '1' in the columns specified by `colIndexes`. + - If multiple ones are found in the same row, or if no '1' is found in a sample row, it falls back to `directCompressDDC`. + +4. **Return Value**: + - If the data meets the OHE criteria, returns a `ColGroupDDC` created with the column indexes, an `IdentityDictionary`, and the data. + - If the data does not meet the OHE criteria, returns the result of `directCompressDDC`. + +## Add method in `ColGroupSizes` +Added method ``estimateInMemorySizeOHE(int nrColumns, boolean contiguousColumns, int nrRows)`` + +## Add method in `AComEst` +Added a getter method `getNnzCols` + +## Edit `distinctCountScale` method in `ComEstSample` +```java +if(freq == null || freq.length == 0) + return numOffs+1; +``` +And added condition: +```java +if(sampleFacts.numRows>sampleFacts.numOffs) + est += 1; +``` +Warning: This Change will cause some tests to fail. 
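
## Illustrative sketch: the OHE candidate check

To make the detection flow in sections 4-6 easier to follow, the sketch below condenses the rule into one standalone method. It is only an illustration: the class, the enum, and the `int[]` inputs are invented for this README and do not match the actual `CoCoderFactory` code, which operates on lists of `CompressedSizeInfoColGroup`.

```java
// Minimal, self-contained sketch of the detection rule from sections 4-6 above.
// Names and signatures are illustrative only.
public class OheDetectionSketch {

	enum OheStatus { NOT_OHE, POTENTIAL_OHE, VALID_OHE }

	static OheStatus check(int[] numValsPerCol, int[] numOffsPerCol, int numRows) {
		final int numCols = numValsPerCol.length;
		if(numCols == 0)
			return OheStatus.NOT_OHE;
		int totalVals = 0, totalOffs = 0;
		for(int i = 0; i < numCols; i++) {
			totalVals += numValsPerCol[i]; // distinct values contributed by column i
			totalOffs += numOffsPerCol[i]; // non-zero entries contributed by column i
			// too many distinct values for the group, or more non-zeros than rows:
			// these candidates can never form a one-hot group
			if(totalVals / 2 > numCols || totalOffs > numRows)
				return OheStatus.NOT_OHE;
		}
		// two distinct values per column on average and exactly numRows non-zeros overall
		if(totalVals / 2 == numCols && totalOffs == numRows)
			return OheStatus.VALID_OHE;
		// not decided yet: keep accumulating candidate columns
		return OheStatus.POTENTIAL_OHE;
	}

	public static void main(String[] args) {
		// three candidate columns over 6 rows, each reporting 2 distinct values and
		// 2 non-zero rows -> together they cover every row exactly once
		System.out.println(check(new int[] {2, 2, 2}, new int[] {2, 2, 2}, 6)); // VALID_OHE
		System.out.println(check(new int[] {2, 2}, new int[] {2, 2}, 6));       // POTENTIAL_OHE
	}
}
```

In the real code the returned status decides whether the candidates move to `oheGroups`, move back to the regular groups, or keep accumulating (section 5).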
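
## Illustrative sketch: the per-row scan in `compressOHE`

The following is a simplified, dense-matrix-only sketch of the row scan described above for the non-transposed case. The real `compressOHE` works on SystemDS `MatrixBlock`/`SparseBlock` structures, records the hit positions in an `AMapToData`, and falls back to `directCompressDDC` instead of returning `null`.

```java
import java.util.Arrays;

// Simplified illustration of the non-transposed row scan; not the SystemDS code.
public class OheRowScanSketch {

	/** Returns, per row, the position of its single 1 within cols,
	 *  or null if the selected columns are not one-hot encoded. */
	static int[] scan(double[][] dense, int[] cols) {
		int[] map = new int[dense.length];
		for(int r = 0; r < dense.length; r++) {
			int hit = -1;
			for(int j = 0; j < cols.length; j++) {
				double v = dense[r][cols[j]];
				if(v == 1) {
					if(hit != -1)
						return null; // second 1 in this row -> not OHE
					hit = j;
				}
				else if(v != 0)
					return null; // non-binary value -> not OHE
			}
			if(hit == -1)
				return null; // row without a 1 -> not OHE
			map[r] = hit;
		}
		return map;
	}

	public static void main(String[] args) {
		double[][] m = {{1, 0, 0}, {0, 0, 1}, {0, 1, 0}};
		System.out.println(Arrays.toString(scan(m, new int[] {0, 1, 2}))); // [0, 2, 1]
	}
}
```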
+ + +## Edit constructor in `CompressedSizeInfoColGroup` +Added a case in switch statement for OHE + +## Added attribute in `CompressionStatistics` +Added Sparsity of input matrix attribute ``public double sparsity;`` to add logging in ``CompressedMatrixBlockFactory`` +## Fix Bug in `extractFacts` method in `SparseEncoding` +Number of distinct values returned was wrong. +Fix: In the return statements, changed map.getUnique() to getUnique() + +## Fix Bug in `outputMatrixPostProcessing` method in `MultiColumnEncoder` +Instead of just recomputing nonzeroes in the else block, added `output.examSparsity(k);` diff --git a/scripts/perftest/ohe_checks/experiment.dml b/scripts/perftest/ohe_checks/experiment.dml index 1a310adfe75..066f2158aba 100644 --- a/scripts/perftest/ohe_checks/experiment.dml +++ b/scripts/perftest/ohe_checks/experiment.dml @@ -21,7 +21,6 @@ Fall = as.frame(x) jspec = "{ids: true, dummycode: " + dummy + "}"; for(i in 1:repeats){ [T,M] = transformencode(target=Fall, spec=jspec) - # T[100,2]=1 xc = compress(T) } print(toString(xc)) diff --git a/scripts/perftest/ohe_checks/experiments.sh b/scripts/perftest/ohe_checks/experiments.sh index 86dfec1143d..3307df61452 100644 --- a/scripts/perftest/ohe_checks/experiments.sh +++ b/scripts/perftest/ohe_checks/experiments.sh @@ -3,12 +3,12 @@ mkdir BaselineLogs mkdir OHELogs run_base() { - systemds experiment.dml \ + $SYSTEMDS_ROOT/bin/systemds $SYSTEMDS_ROOT/target/SystemDS.jar experiment.dml \ --seed 42 --debug -nvargs rows=$1 cols=$2 dummy="$3" distinct=$4 > BaselineLogs/${5}_${1}_rows_${2}_cols_${3}_encoded_base.txt 2>&1 } run_ohe() { - systemds experiment.dml \ + $SYSTEMDS_ROOT/bin/systemds $SYSTEMDS_ROOT/target/SystemDS.jar experiment.dml \ --seed 42 --debug --config ohe.xml -nvargs rows=$1 cols=$2 dummy="$3" distinct=$4> OHELogs/${5}_${1}_rows_${2}_cols_${3}_encoded_ohe.txt 2>&1 } diff --git a/scripts/perftest/ohe_checks/parse_logs.py b/scripts/perftest/ohe_checks/parse_logs.py new file mode 100644 index 00000000000..77c914b78a1 --- /dev/null +++ b/scripts/perftest/ohe_checks/parse_logs.py @@ -0,0 +1,68 @@ +import os +import re +import pandas as pd + +# Patterns to search for +patterns = { + "num_col_groups": re.compile(r"--num col groups:\s+(\d+)"), + "compressed_size": re.compile(r"--compressed size:\s+(\d+)"), + "compression_ratio": re.compile(r"--compression ratio:\s+([\d.]+)"), + "execution_time": re.compile(r"Total execution time:\s+([\d.]+) sec.") +} + +# Function to extract metrics from the text files +def extract_metrics(file_path): + with open(file_path, 'r') as file: + content = file.read() + metrics = {} + for key, pattern in patterns.items(): + match = pattern.search(content) + if match: + metrics[key] = match.group(1) + else: + metrics[key] = None + return metrics + +# Directories for baseline and OHE +baseline_dir = "BaselineLogs" +ohe_dir = "OHELogs" + +# Data storage +data_combined = [] + +# Process baseline and corresponding OHE files +for file_name in os.listdir(baseline_dir): + if file_name.endswith("_encoded_base.txt"): + experiment_name = file_name[:-4] # Remove the .txt extension + file_path_baseline = os.path.join(baseline_dir, file_name) + metrics_baseline = extract_metrics(file_path_baseline) + + file_name_ohe = file_name.replace('_base.txt', '_ohe.txt').replace('Baseline', 'OHE') + file_path_ohe = os.path.join(ohe_dir, file_name_ohe) + if os.path.exists(file_path_ohe): + metrics_ohe = extract_metrics(file_path_ohe) + else: + metrics_ohe = {key: None for key in patterns.keys()} + + combined_metrics = { + 
'experiment': experiment_name, + 'baseline_num_col_groups': metrics_baseline.get('num_col_groups'), + 'baseline_compressed_size': metrics_baseline.get('compressed_size'), + 'baseline_compression_ratio': metrics_baseline.get('compression_ratio'), + 'baseline_execution_time': metrics_baseline.get('execution_time'), + 'ohe_num_col_groups': metrics_ohe.get('num_col_groups'), + 'ohe_compressed_size': metrics_ohe.get('compressed_size'), + 'ohe_compression_ratio': metrics_ohe.get('compression_ratio'), + 'ohe_execution_time': metrics_ohe.get('execution_time') + } + + data_combined.append(combined_metrics) + +# Create DataFrame +df_combined = pd.DataFrame(data_combined) + +# Write to Excel +output_file = 'combined_metrics.xlsx' +df_combined.to_excel(output_file, index=False) + +print(f'Excel file "{output_file}" has been created successfully.') diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java index 45c05912644..4ed7e0fbe25 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCoderFactory.java @@ -85,7 +85,6 @@ public static CompressedSizeInfo findCoCodesByPartitioning(AComEst est, Compress // filter groups List currentCandidates = new ArrayList<>(); List> oheGroups = new ArrayList<>(); - long startOheCheckTime = System.nanoTime(); boolean isSample = est.getClass().getSimpleName().equals("ComEstSample"); if(est.getNnzCols() == null) LOG.debug("NNZ is null"); @@ -120,9 +119,6 @@ else if(oheStatus.equals("VALID_OHE")) { } } } - long endOheCheckTime = System.nanoTime(); // End time for OHE checks - long durationOheCheckTime = endOheCheckTime - startOheCheckTime; - LOG.debug("OHE checks duration: " + durationOheCheckTime / 1e6 + " ms"); // If currentCandidates is not empty, add it to groups if(!currentCandidates.isEmpty()) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/ComEstSample.java b/src/main/java/org/apache/sysds/runtime/compress/estim/ComEstSample.java index f25e3150fa0..42fef705e68 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/estim/ComEstSample.java +++ b/src/main/java/org/apache/sysds/runtime/compress/estim/ComEstSample.java @@ -173,7 +173,6 @@ private int distinctCountScale(EstimationFactors sampleFacts, int numOffs, int n // Bound the estimate with the maxDistinct. 
if(sampleFacts.numRows>sampleFacts.numOffs) est += 1; - // LOG.error(sampleFacts.numOffs + " - " + sampled, null); return Math.max(Math.min(est, Math.min(maxDistinct, numOffs)), 1); } From d2448e0a43993a57590686300350cfa9935541b6 Mon Sep 17 00:00:00 2001 From: Youssef Date: Fri, 26 Jul 2024 12:43:30 +0200 Subject: [PATCH 10/11] Added licenses to new files --- conf/log4j-compression.properties | 21 +++++++++++++++++++++ scripts/perftest/ohe_checks/experiment.dml | 21 +++++++++++++++++++++ scripts/perftest/ohe_checks/experiments.sh | 22 +++++++++++++++++++++- scripts/perftest/ohe_checks/ohe.xml | 19 +++++++++++++++++++ scripts/perftest/ohe_checks/parse_logs.py | 22 ++++++++++++++++++++++ 5 files changed, 104 insertions(+), 1 deletion(-) diff --git a/conf/log4j-compression.properties b/conf/log4j-compression.properties index 2f251773da4..0bafbe31d20 100644 --- a/conf/log4j-compression.properties +++ b/conf/log4j-compression.properties @@ -1,3 +1,24 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + log4j.rootLogger=ERROR, console log4j.logger.org.apache.sysds=INFO diff --git a/scripts/perftest/ohe_checks/experiment.dml b/scripts/perftest/ohe_checks/experiment.dml index 066f2158aba..c89f7dcaef6 100644 --- a/scripts/perftest/ohe_checks/experiment.dml +++ b/scripts/perftest/ohe_checks/experiment.dml @@ -1,3 +1,24 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + ## This script generates a random matrix, transforms some columns to be One-Hot-Encoded, and then compresses # Set default values diff --git a/scripts/perftest/ohe_checks/experiments.sh b/scripts/perftest/ohe_checks/experiments.sh index 3307df61452..cf252c39fe1 100644 --- a/scripts/perftest/ohe_checks/experiments.sh +++ b/scripts/perftest/ohe_checks/experiments.sh @@ -1,4 +1,24 @@ -#!/bin/bash +#!/usr/bin/env bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- mkdir BaselineLogs mkdir OHELogs diff --git a/scripts/perftest/ohe_checks/ohe.xml b/scripts/perftest/ohe_checks/ohe.xml index b5eaf4c9ca8..35c3e481a07 100644 --- a/scripts/perftest/ohe_checks/ohe.xml +++ b/scripts/perftest/ohe_checks/ohe.xml @@ -1,3 +1,22 @@ + + true diff --git a/scripts/perftest/ohe_checks/parse_logs.py b/scripts/perftest/ohe_checks/parse_logs.py index 77c914b78a1..edd27a6e7e4 100644 --- a/scripts/perftest/ohe_checks/parse_logs.py +++ b/scripts/perftest/ohe_checks/parse_logs.py @@ -1,3 +1,25 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + + import os import re import pandas as pd From cb7534b99b9e79c83a76fa053fc8afae491b4ec5 Mon Sep 17 00:00:00 2001 From: Youssef Date: Tue, 30 Jul 2024 13:49:11 +0200 Subject: [PATCH 11/11] Minor cleanups --- scripts/perftest/ohe_checks/README.md | 34 ++----------------- scripts/perftest/ohe_checks/experiments.sh | 2 +- .../ohe_checks}/log4j-compression.properties | 2 +- .../compress/estim/CompressedSizeInfo.java | 4 ++- 4 files changed, 7 insertions(+), 35 deletions(-) rename {conf => scripts/perftest/ohe_checks}/log4j-compression.properties (98%) diff --git a/scripts/perftest/ohe_checks/README.md b/scripts/perftest/ohe_checks/README.md index d52fe401ceb..bdf0df4eaa4 100644 --- a/scripts/perftest/ohe_checks/README.md +++ b/scripts/perftest/ohe_checks/README.md @@ -21,7 +21,8 @@ limitations under the License. To run all tests for One Hot Encoding Checks: * install systemds, - * make sure that the paths for SYSTEMDS_ROOT, JAVA_HOME, HADOOP_HOME, LOG4JPROP are correctly set + * make sure that the paths for SYSTEMDS_ROOT, JAVA_HOME, HADOOP_HOME are correctly set + * set the path for LOG4JPROP to `$SYSTEMDS_ROOT/scripts/perftest/ohe_checks/log4j-compression.properties` * run experiments.sh Alternatively, to run the experiment.dml script directly with OHE checks enabled, use this command: @@ -118,34 +119,3 @@ The `compressOHE` function is designed to compress columns that are one-hot enco 4. **Return Value**: - If the data meets the OHE criteria, returns a `ColGroupDDC` created with the column indexes, an `IdentityDictionary`, and the data. - If the data does not meet the OHE criteria, returns the result of `directCompressDDC`. - -## Add method in `ColGroupSizes` -Added method ``estimateInMemorySizeOHE(int nrColumns, boolean contiguousColumns, int nrRows)`` - -## Add method in `AComEst` -Added a getter method `getNnzCols` - -## Edit `distinctCountScale` method in `ComEstSample` -```java -if(freq == null || freq.length == 0) - return numOffs+1; -``` -And added condition: -```java -if(sampleFacts.numRows>sampleFacts.numOffs) - est += 1; -``` -Warning: This Change will cause some tests to fail. - - -## Edit constructor in `CompressedSizeInfoColGroup` -Added a case in switch statement for OHE - -## Added attribute in `CompressionStatistics` -Added Sparsity of input matrix attribute ``public double sparsity;`` to add logging in ``CompressedMatrixBlockFactory`` -## Fix Bug in `extractFacts` method in `SparseEncoding` -Number of distinct values returned was wrong. 
-Fix: In the return statements, changed map.getUnique() to getUnique() - -## Fix Bug in `outputMatrixPostProcessing` method in `MultiColumnEncoder` -Instead of just recomputing nonzeroes in the else block, added `output.examSparsity(k);` diff --git a/scripts/perftest/ohe_checks/experiments.sh b/scripts/perftest/ohe_checks/experiments.sh index cf252c39fe1..2e6e2bd2786 100644 --- a/scripts/perftest/ohe_checks/experiments.sh +++ b/scripts/perftest/ohe_checks/experiments.sh @@ -58,4 +58,4 @@ run_base 100000 1 "[1]" 100 8 run_base 100000 5 "[1,2]" 100 9 run_base 100000 5 "[1,2,3]" 100 10 run_base 100000 100 "[1,3,50,60,70,80]" 100 11 -run_base 100000 100 "[1,2,24,25,50,51]" 100 12 \ No newline at end of file +run_base 100000 100 "[1,2,24,25,50,51]" 100 12 diff --git a/conf/log4j-compression.properties b/scripts/perftest/ohe_checks/log4j-compression.properties similarity index 98% rename from conf/log4j-compression.properties rename to scripts/perftest/ohe_checks/log4j-compression.properties index 0bafbe31d20..5f7c1cd70b5 100644 --- a/conf/log4j-compression.properties +++ b/scripts/perftest/ohe_checks/log4j-compression.properties @@ -30,4 +30,4 @@ log4j.logger.org.apache.hadoop=ERROR log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.target=System.err log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n \ No newline at end of file +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java index 83e2d6ab70e..ed2018a7b95 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java +++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java @@ -25,6 +25,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; +import org.apache.sysds.runtime.compress.DMLCompressionException; /** * A helper reusable object for maintaining information about estimated compression @@ -96,7 +97,8 @@ public String getEstimatedDistinct() { if(compressionInfo == null) return ""; if(compressionInfo.size()<=0) - return ""; + throw new DMLCompressionException("Size of compression info is <= 0"); + sb.append("["); sb.append(compressionInfo.get(0).getNumVals()); for(int i = 1; i < compressionInfo.size(); i++)