From 137cf2043b7fb35569f31b94cf7d22e7a64a5953 Mon Sep 17 00:00:00 2001
From: Paulo-Mansano
Date: Sun, 29 Jun 2025 21:38:26 +0200
Subject: [PATCH] New classes: ColumnEncoderRagged, TransformDummySeparatedTest,
 TransformRaggedTest. Modified: TfUtils, ColumnEncoder, EncoderFactory,
 TransformCustomTest

---
Review notes (text in this section is not part of the commit):
 - ColumnEncoderRagged: null cells are encoded with the null index in apply,
   getCodeCol and getCode instead of failing on toString(); the slot for
   missing values grows the dictionary when it is full; the null index is
   reset on every build/initMetaData; the transient lookup map is rebuilt
   from the dictionary when absent; meta data is written to and read from
   the encoder's own column when the meta frame has one (column 0 otherwise).
 - EncoderFactory: ragged columns are excluded from the pass-through list and
   the overlap error message names "ragged" instead of the variable name.
 - TransformRaggedTest: exceptions now fail the test, results are asserted.
 - Open question: please verify the meaning of the two int parameters of
   getCodeCol against ColumnEncoder#getCodeCol; the semantics of the
   submission (column, first row) are kept unchanged here.
 - The blob ids on the index lines are those of the original submission.

 hello.dml                                     |   1 +
 .../sysds/runtime/transform/TfUtils.java      |   2 +-
 .../transform/encode/ColumnEncoder.java       |   4 +-
 .../transform/encode/ColumnEncoderRagged.java | 242 ++++++++++++++++++
 .../transform/encode/EncoderFactory.java      |  13 +-
 .../frame/transform/TransformCustomTest.java  |   5 +-
 .../TransformDummySeparatedTest.java          |  74 ++++++
 .../frame/transform/TransformRaggedTest.java  | 153 +++++++++++
 8 files changed, 486 insertions(+), 8 deletions(-)
 create mode 100644 hello.dml
 create mode 100644 src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderRagged.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/frame/transform/TransformDummySeparatedTest.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/frame/transform/TransformRaggedTest.java

diff --git a/hello.dml b/hello.dml
new file mode 100644
index 00000000000..03be2397d3c
--- /dev/null
+++ b/hello.dml
@@ -0,0 +1 @@
+print("Hello SystemDS")
diff --git a/src/main/java/org/apache/sysds/runtime/transform/TfUtils.java b/src/main/java/org/apache/sysds/runtime/transform/TfUtils.java
index 67ee1776f1c..cbeb4cfa360 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/TfUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/TfUtils.java
@@ -47,7 +47,7 @@ protected byte toID() {
 
 	//transform methods
 	public enum TfMethod {
-		IMPUTE, RECODE, HASH, BIN, DUMMYCODE, UDF, OMIT, WORD_EMBEDDING, BAG_OF_WORDS;
+		IMPUTE, RECODE, HASH, BIN, DUMMYCODE, UDF, OMIT, WORD_EMBEDDING, BAG_OF_WORDS, RAGGED;
 		@Override
 		public String toString() {
 			return name().toLowerCase();
diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoder.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoder.java
index 037e7bea1d7..0ac6ec9f924 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoder.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoder.java
@@ -75,7 +75,7 @@ public void initEmbeddings(MatrixBlock embeddings){
 	}
 
 	protected enum TransformType{
-		BIN, RECODE, DUMMYCODE, FEATURE_HASH, PASS_THROUGH, UDF, WORD_EMBEDDING, BAG_OF_WORDS, N_A
+		BIN, RECODE, DUMMYCODE, FEATURE_HASH, PASS_THROUGH, UDF, WORD_EMBEDDING, BAG_OF_WORDS, RAGGED, N_A
 	}
 
 	protected ColumnEncoder(int colID) {
@@ -447,7 +447,7 @@ protected void setApplyRowBlocksPerColumn(int nPart) {
 	}
 
 	public enum EncoderType {
-		Recode, FeatureHash, PassThrough, Bin, Dummycode, Omit, MVImpute, Composite, WordEmbedding, BagOfWords
+		Recode, FeatureHash, PassThrough, Bin, Dummycode, Omit, MVImpute, Composite, WordEmbedding, BagOfWords, Ragged
 	}
 
 	/*
diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderRagged.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderRagged.java
new file mode 100644
index 00000000000..a326c71af73
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderRagged.java
@@ -0,0 +1,242 @@
+package org.apache.sysds.runtime.transform.encode;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.sysds.common.Types.ValueType;
+import org.apache.sysds.runtime.controlprogram.caching.CacheBlock;
+import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+/**
+ * Dictionary encoder: each distinct value of the column is stored once in a dictionary and every cell is
+ * replaced by the position of its value in that dictionary.
+ *
+ * {@code null} cells and the strings {@code "NA"}, {@code "NaN"} and {@code ""} share a single dictionary slot,
+ * the null index. Values that are not part of the dictionary are encoded with the null index as well; it is
+ * {@code -1} as long as no missing value has been registered.
+ */
+public class ColumnEncoderRagged extends ColumnEncoder {
+	private static final long serialVersionUID = 2291732648968734088L;
+
+	/** Upper bound for the initial dictionary capacity; the array grows on demand. */
+	private static final int INITIAL_DICT_CAPACITY = 1024;
+	private static final String[] DEFAULT_NA_STRINGS = new String[] {"NA", "NaN", ""};
+
+	/** Dictionary entries by code; only the first {@code _dictSize} slots are in use. */
+	private Object[] _dict;
+	private int _dictSize;
+	/** Code shared by all missing values, or -1 if none has been registered. */
+	private int _nullIndex = -1;
+
+	/** Reverse mapping from value to code; transient, rebuilt from the dictionary when absent. */
+	private transient Map<String, Integer> _valueToIndex;
+
+	public ColumnEncoderRagged() {
+		super(-1); // ID will be set during construction
+	}
+
+	public ColumnEncoderRagged(int colID) {
+		super(colID);
+	}
+
+	@Override
+	protected TransformType getTransformType() {
+		return TransformType.RAGGED;
+	}
+
+	/**
+	 * Builds the dictionary from this encoder's column, discarding any previously built state.
+	 *
+	 * @param in input block, must be a {@link FrameBlock}
+	 * @throws IllegalArgumentException if {@code in} is not a frame or the column ID is out of range
+	 */
+	@Override
+	public void build(CacheBlock<?> in) {
+		FrameBlock fin = asFrame(in);
+		if(_colID < 1 || _colID > fin.getNumColumns())
+			throw new IllegalArgumentException("Invalid column ID: " + _colID);
+
+		final int nRows = fin.getNumRows();
+		_valueToIndex = new HashMap<>();
+		_dict = new Object[Math.min(INITIAL_DICT_CAPACITY, nRows)];
+		_dictSize = 0;
+		_nullIndex = -1; // a rebuild must not keep pointing into the previous dictionary
+
+		for(int i = 0; i < nRows; i++) {
+			String val = cellToString(fin, i);
+			if(isNAValue(val)) {
+				// all missing values share one slot, created on first sight
+				if(_nullIndex == -1)
+					_nullIndex = appendToDictionary(null);
+			}
+			else if(!_valueToIndex.containsKey(val))
+				_valueToIndex.put(val, appendToDictionary(val));
+		}
+	}
+
+	/**
+	 * Writes the code of every row into column {@code outputCol} of {@code out}.
+	 *
+	 * @param in        input block, must be a {@link FrameBlock}
+	 * @param out       target matrix; allocated with {@code outputCol + 1} columns if {@code null}
+	 * @param outputCol zero-based target column
+	 * @return the matrix that received the codes
+	 */
+	@Override
+	public MatrixBlock apply(CacheBlock<?> in, MatrixBlock out, int outputCol) {
+		FrameBlock fin = asFrame(in);
+		final int nRows = fin.getNumRows();
+		if(out == null)
+			out = new MatrixBlock(nRows, outputCol + 1, false);
+		for(int i = 0; i < nRows; i++)
+			out.set(i, outputCol, (double) lookupCode(fin, i));
+		return out;
+	}
+
+	/**
+	 * Returns the codes of the rows {@code rowStart} to the last row of {@code in}.
+	 *
+	 * @param in        input block, must be a {@link FrameBlock}
+	 * @param outputCol not used by this encoder
+	 * @param rowStart  first row to encode
+	 * @param tmp       optional buffer for the result; allocated if {@code null}
+	 * @return the buffer holding one code per encoded row
+	 */
+	@Override
+	public double[] getCodeCol(CacheBlock<?> in, int outputCol, int rowStart, double[] tmp) {
+		FrameBlock fin = asFrame(in);
+		final int nRows = fin.getNumRows();
+		if(tmp == null)
+			tmp = new double[nRows - rowStart];
+		for(int i = rowStart; i < nRows; i++)
+			tmp[i - rowStart] = lookupCode(fin, i);
+		return tmp;
+	}
+
+	@Override
+	public double getCode(CacheBlock<?> in, int row) {
+		return lookupCode(asFrame(in), row);
+	}
+
+	/**
+	 * Stores the dictionary in the meta frame, one entry per row.
+	 *
+	 * @param out meta frame; a single-column string frame is created if {@code null}
+	 * @return the meta frame holding the dictionary
+	 */
+	@Override
+	public FrameBlock getMetaData(FrameBlock out) {
+		if(out == null)
+			out = new FrameBlock(1, ValueType.STRING);
+		out.ensureAllocatedColumns(_dictSize);
+		final int col = metaColumn(out);
+		for(int i = 0; i < _dictSize; i++)
+			out.set(i, col, _dict[i]);
+		return out;
+	}
+
+	/**
+	 * Restores the dictionary from a meta frame written by {@link #getMetaData(FrameBlock)}.
+	 *
+	 * @param meta meta frame; {@code null} or empty frames leave the encoder untouched
+	 */
+	@Override
+	public void initMetaData(FrameBlock meta) {
+		if(meta == null || meta.getNumRows() == 0)
+			return;
+
+		final int col = metaColumn(meta);
+		_dictSize = meta.getNumRows();
+		_dict = new Object[_dictSize];
+		_valueToIndex = new HashMap<>();
+		_nullIndex = -1;
+
+		for(int i = 0; i < _dictSize; i++) {
+			_dict[i] = meta.get(i, col);
+			if(_dict[i] == null) {
+				// keep the first empty row: later ones may only be padding of a shared meta frame
+				if(_nullIndex == -1)
+					_nullIndex = i;
+			}
+			else
+				_valueToIndex.put(_dict[i].toString(), i);
+		}
+	}
+
+	// Other required methods with default implementations
+	@Override public void allocateMetaData(FrameBlock meta) {}
+	@Override public void prepareBuildPartial() {}
+	@Override public void buildPartial(FrameBlock in) { build(in); }
+	@Override public void updateIndexRanges(long[] beginDims, long[] endDims, int offset) {}
+
+	/** @return the backing dictionary array; only the first {@link #getDictionarySize()} entries are valid */
+	public Object[] getDictionary() {
+		return _dict;
+	}
+
+	public int getDictionarySize() {
+		return _dictSize;
+	}
+
+	public int getNullIndex() {
+		return _nullIndex;
+	}
+
+	/** Casts the input to a frame or rejects it. */
+	private static FrameBlock asFrame(CacheBlock<?> in) {
+		if(!(in instanceof FrameBlock))
+			throw new IllegalArgumentException("Ragged encoding only supports FrameBlock input");
+		return (FrameBlock) in;
+	}
+
+	/** Reads the cell of this encoder's column as a string, keeping {@code null} cells as {@code null}. */
+	private String cellToString(FrameBlock fin, int row) {
+		Object cell = fin.get(row, _colID - 1);
+		return cell != null ? cell.toString() : null;
+	}
+
+	/** Tells whether a value counts as missing. */
+	private static boolean isNAValue(String val) {
+		if(val == null)
+			return true;
+		for(String na : DEFAULT_NA_STRINGS)
+			if(na.equals(val))
+				return true;
+		return false;
+	}
+
+	/** Appends an entry, growing the array when it is full, and returns the code of the new entry. */
+	private int appendToDictionary(Object value) {
+		if(_dictSize == _dict.length)
+			_dict = Arrays.copyOf(_dict, Math.max(1, _dict.length * 2));
+		_dict[_dictSize] = value;
+		return _dictSize++;
+	}
+
+	/** Returns the code of one cell; missing and unknown values yield the null index. */
+	private int lookupCode(FrameBlock fin, int row) {
+		String val = cellToString(fin, row);
+		if(isNAValue(val))
+			return _nullIndex;
+		return lookup().getOrDefault(val, _nullIndex);
+	}
+
+	/** Returns the reverse mapping, rebuilding it from the dictionary if it is absent. */
+	private Map<String, Integer> lookup() {
+		if(_valueToIndex == null) {
+			Map<String, Integer> map = new HashMap<>();
+			for(int i = 0; i < _dictSize; i++)
+				if(_dict[i] != null)
+					map.put(_dict[i].toString(), i);
+			_valueToIndex = map;
+		}
+		return _valueToIndex;
+	}
+
+	/** Picks the meta frame column of this encoder; frames without such a column use column 0. */
+	private int metaColumn(FrameBlock meta) {
+		return (_colID >= 1 && _colID <= meta.getNumColumns()) ? _colID - 1 : 0;
+	}
+}
diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
index 1294d0e7e79..134182abbde 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
@@ -120,13 +120,15 @@ public static MultiColumnEncoder createEncoder(String spec, String[] colnames, i
 				.toObject(TfMetaUtils.parseJsonIDList(jSpec, colnames, TfMethod.WORD_EMBEDDING.toString(), minCol, maxCol)));
 			List<Integer> bowIDs = Arrays.asList(ArrayUtils
 				.toObject(TfMetaUtils.parseJsonIDList(jSpec, colnames, TfMethod.BAG_OF_WORDS.toString(), minCol, maxCol)));
+			List<Integer> ragIDs = Arrays.asList(ArrayUtils
+				.toObject(TfMetaUtils.parseJsonIDList(jSpec, colnames, TfMethod.RAGGED.toString(), minCol, maxCol)));
 
 			// NOTE: any dummycode column requires recode as preparation, unless the dummycode
 			// column follows binning or feature hashing
 			rcIDs = unionDistinct(rcIDs, except(except(dcIDs, binIDs), haIDs));
 			// Error out if the first level encoders have overlaps
-			if (intersect(rcIDs, binIDs, haIDs, weIDs, bowIDs))
-				throw new DMLRuntimeException("More than one encoders (recode, binning, hashing, word_embedding, bag_of_words) on one column is not allowed:\n" + spec);
+			if (intersect(rcIDs, binIDs, haIDs, weIDs, bowIDs, ragIDs))
+				throw new DMLRuntimeException("More than one encoders (recode, binning, hashing, word_embedding, bag_of_words, ragged) on one column is not allowed:\n" + spec);
 
-			List<Integer> ptIDs = except(UtilFunctions.getSeqList(1, clen, 1), naryUnionDistinct(rcIDs, haIDs, binIDs, weIDs, bowIDs));
+			List<Integer> ptIDs = except(UtilFunctions.getSeqList(1, clen, 1), unionDistinct(naryUnionDistinct(rcIDs, haIDs, binIDs, weIDs, bowIDs), ragIDs));
 			List<Integer> oIDs = new ArrayList<>(Arrays.asList(ArrayUtils
@@ -158,6 +160,9 @@ public static MultiColumnEncoder createEncoder(String spec, String[] colnames, i
 			if(!weIDs.isEmpty())
 				for(Integer id : weIDs)
 					addEncoderToMap(new ColumnEncoderWordEmbedding(id), colEncoders);
+			if(!ragIDs.isEmpty())
+				for(Integer id : ragIDs)
+					addEncoderToMap(new ColumnEncoderRagged(id), colEncoders);
 			if(!bowIDs.isEmpty())
 				for(Integer id : bowIDs)
 					addEncoderToMap(new ColumnEncoderBagOfWords(id), colEncoders);
@@ -287,6 +292,8 @@ public static ColumnEncoder createInstance(int type) {
 				return new ColumnEncoderWordEmbedding();
 			case BagOfWords:
 				return new ColumnEncoderBagOfWords();
+			case Ragged:
+				return new ColumnEncoderRagged();
 			default:
 				throw new DMLRuntimeException("Unsupported encoder type: " + etype);
 		}
diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java
index 92f074c20c9..773d77fd57e 100644
--- a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java
+++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java
@@ -97,9 +97,10 @@ public void test(String spec) {
 		try {
 
 			FrameBlock meta = null;
-			MultiColumnEncoder encoder = EncoderFactory.createEncoder(spec, data.getColumnNames(), data.getNumColumns(),
-				meta);
+			MultiColumnEncoder encoder = EncoderFactory.createEncoder(spec, data.getColumnNames(), data.getNumColumns(), meta);
 			MatrixBlock out = encoder.encode(data);
+			meta = encoder.getMetaData(meta); //I added this just to have the frame stored somewhere
+			System.out.println(meta);
 			MatrixBlock out2 = encoder.apply(data);
 
 			TestUtils.compareMatrices(out, out2, 0, "Not Equal after apply");
diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformDummySeparatedTest.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformDummySeparatedTest.java
new file mode 100644
index 00000000000..e7cf3e954bd
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformDummySeparatedTest.java
@@ -0,0 +1,74 @@
+package org.apache.sysds.test.component.frame.transform;
+
+import static org.junit.Assert.fail;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.common.Types.ValueType;
+import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.transform.encode.ColumnEncoder;
+import org.apache.sysds.runtime.transform.encode.ColumnEncoderComposite;
+import org.apache.sysds.runtime.transform.encode.ColumnEncoderDummycode;
+import org.apache.sysds.runtime.transform.encode.ColumnEncoderPassThrough;
+import org.apache.sysds.runtime.transform.encode.CompressedEncode;
+import org.apache.sysds.runtime.transform.encode.EncoderFactory;
+import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+public class TransformDummySeparatedTest extends AutomatedTestBase {
+	protected static final Log LOG = LogFactory.getLog(TransformDummySeparatedTest.class.getName());
+
+	final FrameBlock data;
+
+	public TransformDummySeparatedTest() {
+		data = TestUtils.generateRandomFrameBlock(100, new org.apache.sysds.common.Types.ValueType[] {
+				org.apache.sysds.common.Types.ValueType.UINT8 }, 231);
+		data.setSchema(new org.apache.sysds.common.Types.ValueType[] {
+				org.apache.sysds.common.Types.ValueType.INT32 });
+	}
+
+	@Test
+	public void testDummySeparatedBasic() {
+
+		test("{ids:true, dummycode:[1]}");
+
+	}
+
+	public void test(String spec) {
+		try {
+			FrameBlock meta = null;
+			MultiColumnEncoder encoder = EncoderFactory.createEncoder(spec, data.getColumnNames(), data.getNumColumns(), meta);
+
+			MatrixBlock out = encoder.encode(data);
+			meta = encoder.getMetaData(new FrameBlock(data.getNumColumns(), org.apache.sysds.common.Types.ValueType.STRING));
+			MatrixBlock out2 = encoder.apply(data);
+
+			// Compare consistency
+			TestUtils.compareMatrices(out, out2, 0, "Not Equal after apply");
+
+			// Print output
+			System.out.println("== Encoded MatrixBlock ==");
+			System.out.println(out.toString());
+
+			System.out.println("== Metadata FrameBlock ==");
+			System.out.println(meta.toString());
+
+		} catch (Exception e) {
+			e.printStackTrace();
+			fail(e.getMessage());
+		}
+	}
+
+	@Override
+	public void setUp() {
+		// TODO Auto-generated method stub
+		//throw new UnsupportedOperationException("Unimplemented method 'setUp'");
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformRaggedTest.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformRaggedTest.java
new file mode 100644
index 00000000000..a03fca54ccf
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformRaggedTest.java
@@ -0,0 +1,153 @@
+package org.apache.sysds.test.component.frame.transform;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.common.Types.ValueType;
+import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.transform.encode.ColumnEncoderRagged;
+import org.apache.sysds.runtime.transform.encode.EncoderFactory;
+import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+public class TransformRaggedTest {
+	protected static final Log LOG = LogFactory.getLog(TransformRaggedTest.class.getName());
+
+	private static final String SPEC_C1 = "{\"ragged\": [\"C1\"]}";
+
+	@Test
+	public void testBasicRaggedEncoding() {
+		FrameBlock data = new FrameBlock(new ValueType[] {ValueType.STRING});
+		data.setColumnNames(new String[] {"C1"});
+		data.appendRow(new Object[] {"apple"});
+		data.appendRow(new Object[] {"orange"});
+		data.appendRow(new Object[] {"apple"});
+		data.appendRow(new Object[] {null});
+		data.appendRow(new Object[] {"banana"});
+		data.appendRow(new Object[] {"orange"});
+		data.appendRow(new Object[] {""});
+		data.appendRow(new Object[] {"apple"});
+
+		testRaggedEncoder(SPEC_C1, data);
+	}
+
+	@Test
+	public void testMixedTypesEncoding() {
+		FrameBlock data = new FrameBlock(new ValueType[] {ValueType.STRING});
+		data.setColumnNames(new String[] {"C1"});
+		// all values are passed as strings
+		data.appendRow(new Object[] {"100"});
+		data.appendRow(new Object[] {"100"});
+		data.appendRow(new Object[] {"true"});
+		data.appendRow(new Object[] {"true"});
+		data.appendRow(new Object[] {null});
+		data.appendRow(new Object[] {"NA"});
+
+		testRaggedEncoder(SPEC_C1, data);
+	}
+
+	@Test
+	public void testLargeDataset() {
+		int numRows = 1000;
+		FrameBlock data = new FrameBlock(new ValueType[] {ValueType.STRING});
+		data.setColumnNames(new String[] {"C1"});
+
+		String[] fruits = {"apple", "orange", "banana", "grape", null, ""};
+		for(int i = 0; i < numRows; i++)
+			data.appendRow(new Object[] {fruits[i % fruits.length]});
+
+		testRaggedEncoder(SPEC_C1, data);
+	}
+
+	@Test
+	public void testRaggedWithOtherEncoders() {
+		FrameBlock data = new FrameBlock(new ValueType[] {ValueType.STRING, ValueType.INT32, ValueType.STRING});
+		data.setColumnNames(new String[] {"C1", "C2", "C3"});
+		data.appendRow(new Object[] {"apple", 10, "red"});
+		data.appendRow(new Object[] {"orange", 20, "orange"});
+		data.appendRow(new Object[] {"apple", 15, "red"});
+		data.appendRow(new Object[] {null, 5, null});
+		data.appendRow(new Object[] {"banana", 25, "yellow"});
+		data.appendRow(new Object[] {"orange", 20, "orange"});
+
+		String spec = "{"
+			+ "\"ragged\": [\"C1\", \"C3\"], "
+			+ "\"bin\": [{"
+			+ "\"id\": \"C2\", "
+			+ "\"method\": \"equi-width\", "
+			+ "\"numbins\": 3, "
+			+ "\"na\": \"impute\" "
+			+ "}]"
+			+ "}";
+
+		testRaggedEncoder(spec, data);
+	}
+
+	@Test
+	public void testRaggedDirectly() {
+		FrameBlock data = new FrameBlock(new ValueType[] {ValueType.STRING});
+		data.appendRow(new Object[] {"apple"});
+		data.appendRow(new Object[] {"orange"});
+		data.appendRow(new Object[] {"apple"});
+
+		ColumnEncoderRagged encoder = new ColumnEncoderRagged(1);
+		encoder.build(data);
+		assertEquals(2, encoder.getDictionarySize());
+		assertEquals(-1, encoder.getNullIndex());
+
+		MatrixBlock out = new MatrixBlock(data.getNumRows(), 1, false);
+		encoder.apply(data, out, 0);
+		assertEquals(0, out.getDouble(0, 0), 0);
+		assertEquals(1, out.getDouble(1, 0), 0);
+		assertEquals(0, out.getDouble(2, 0), 0);
+	}
+
+	@Test
+	public void testRaggedDirectlyWithMissingValues() {
+		FrameBlock data = new FrameBlock(new ValueType[] {ValueType.STRING});
+		data.appendRow(new Object[] {"apple"});
+		data.appendRow(new Object[] {null});
+		data.appendRow(new Object[] {"NA"});
+		data.appendRow(new Object[] {"orange"});
+
+		ColumnEncoderRagged encoder = new ColumnEncoderRagged(1);
+		encoder.build(data);
+		assertEquals(3, encoder.getDictionarySize());
+		assertEquals(1, encoder.getNullIndex());
+
+		// null cells must be encoded like every other missing value instead of failing
+		MatrixBlock out = encoder.apply(data, null, 0);
+		assertEquals(0, out.getDouble(0, 0), 0);
+		assertEquals(1, out.getDouble(1, 0), 0);
+		assertEquals(1, out.getDouble(2, 0), 0);
+		assertEquals(2, out.getDouble(3, 0), 0);
+	}
+
+	/** Encodes {@code data} with {@code spec} and checks that re-applying the encoder yields the same matrix. */
+	private void testRaggedEncoder(String spec, FrameBlock data) {
+		try {
+			FrameBlock meta = null;
+			MultiColumnEncoder encoder = EncoderFactory.createEncoder(spec, data.getColumnNames(),
+				data.getNumColumns(), meta);
+
+			MatrixBlock encoded = encoder.encode(data);
+			assertEquals(data.getNumRows(), encoded.getNumRows());
+
+			meta = encoder.getMetaData(meta);
+			assertNotNull(meta);
+
+			MatrixBlock reapplied = encoder.apply(data);
+			TestUtils.compareMatrices(encoded, reapplied, 0, "Not Equal after apply");
+		}
+		catch(Exception e) {
+			// a failing encoder must fail the test instead of being logged and ignored
+			e.printStackTrace();
+			fail(e.getMessage());
+		}
+	}
+}